# Python Data Science Toolbox

In [1]:
import pandas as pd
df = pd.read_csv('../data/tweets.csv')
df.head(3)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,filter_level,geo,id,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,timestamp_ms,truncated,user
0,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [{'screen_na...","{'media': [{'sizes': {'large': {'w': 1024, 'h'...",0,False,low,,714960401759387648,...,,,0,False,"{'retweeted': False, 'text': "".@krollbondratin...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @bpolitics: .@krollbondrating's Christopher...,1459294817758,False,"{'utc_offset': 3600, 'profile_image_url_https'..."
1,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [{'text': 'cruzsexscandal', 'indi...","{'media': [{'sizes': {'large': {'w': 500, 'h':...",0,False,low,,714960401977319424,...,,,0,False,"{'retweeted': False, 'text': '@dmartosko Cruz ...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @HeidiAlpine: @dmartosko Cruz video found.....,1459294817810,False,"{'utc_offset': None, 'profile_image_url_https'..."
2,,,Tue Mar 29 23:40:17 +0000 2016,"{'hashtags': [], 'user_mentions': [], 'symbols...",,0,False,low,,714960402426236928,...,,,0,False,,"<a href=""http://www.facebook.com/twitter"" rel=...",Njihuni me ZonjÃ«n Trump !!! | Ekskluzive http...,1459294817917,False,"{'utc_offset': 7200, 'profile_image_url_https'..."


## 1. Writing your own functions
User-defined functions

In [2]:
def shout(word):
    """Return ``word`` with three exclamation marks appended."""
    return word + '!!!'

yell = shout("congratulations")
print(yell)

congratulations!!!


Multiple parameters and return values

In [3]:
def shout(word1, word2):
    """Return the two words separated by a single space, followed by '!!!'."""
    phrase = word1 + " " + word2
    return phrase + "!!!"

yell = shout("congratulations", "you")
print(yell)

congratulations you!!!


In [4]:
def shout_all(word1, word2):
    """Return a 2-tuple holding each word with '!!!' appended, in argument order."""
    return tuple(word + '!!!' for word in (word1, word2))

yell1, yell2 = shout_all('congratulations','you')
print(yell1)
print(yell2)

congratulations!!!
you!!!


Bringing it all together

In [5]:
def count_entries(df, col_name):
    """Count how often each distinct value occurs in ``df[col_name]``.

    Args:
        df: Object indexable by column name (a pandas DataFrame in this notebook).
        col_name: Name of the column whose entries are tallied.

    Returns:
        dict mapping each value found in the column to its number of occurrences.
    """
    tally = {}
    for value in df[col_name]:
        # .get() supplies 0 the first time a value is seen.
        tally[value] = tally.get(value, 0) + 1
    return tally

result = count_entries(df,'lang')
print(result)

{'en': 97, 'et': 1, 'und': 2}


## 2. Default arguments, variable-length arguments and scope
Scope and user-defined functions

In [6]:
team = "teen titans"


def change_team() -> None:
    """Rebind the module-level ``team`` variable to "justice league"."""
    # ``global`` makes the assignment below target the module-level name
    # instead of creating a new local variable.
    global team
    team = "justice league"


print(team)
change_team()
print(team)

teen titans
justice league


Nested functions

In [7]:
def three_shouts(word1, word2, word3):
    """Return a 3-tuple of the given words, each with '!!!' appended."""

    def emphasize(text):
        """Append '!!!' to ``text``."""
        return text + "!!!"

    return tuple(emphasize(item) for item in (word1, word2, word3))

print(three_shouts("congratulations", "you", "me"))

('congratulations!!!', 'you!!!', 'me!!!')


In [8]:
def echo_shout(word):
    """Print ``word`` doubled, then print the doubled word with '!!!' appended.

    The second value is produced by a nested function that rebinds the
    enclosing function's variable through ``nonlocal``.
    """
    doubled = word + word
    print(doubled)

    def add_bangs():
        """Rebind the enclosing ``doubled`` to itself plus '!!!'."""
        nonlocal doubled
        doubled = doubled + "!!!"

    add_bangs()
    print(doubled)

echo_shout("Hi")

HiHi
HiHi!!!


Default and flexible arguments

In [9]:
def shout_echo(word1, echo=1, intense=False):
    """Repeat ``word1`` ``echo`` times and append '!!!'.

    Args:
        word1: Text to repeat.
        echo: Number of repetitions (default 1).
        intense: When truthy, the repeated text is upper-cased first.

    Returns:
        The repeated (optionally upper-cased) text followed by '!!!'.
    """
    repeated = word1 * echo
    if intense:
        repeated = repeated.upper()
    return repeated + "!!!"
    
with_big_echo = shout_echo("Hey", 5, True)
big_no_echo = shout_echo("Bye", intense=True)

print(with_big_echo)
print(big_no_echo)

HEYHEYHEYHEYHEY!!!
BYE!!!


In [10]:
def gibberish(*args):
    """Concatenate all positional string arguments, in order, into one string.

    Args:
        *args: Any number of strings.

    Returns:
        The concatenation of the arguments; '' when called with none.

    Raises:
        TypeError: If any argument is not a string.
    """
    # str.join builds the result in one pass instead of creating a new
    # intermediate string on every loop iteration.
    return ''.join(args)

one_word = gibberish("word")
many_words = gibberish("one", "two", "three")

print(one_word)
print(many_words)

word
onetwothree


In [11]:
def report_status(**kwargs):
    """Print a small report with one ``key: value`` line per keyword argument.

    Args:
        **kwargs: Arbitrary keyword arguments. Values may be of any type;
            each is rendered with ``str()``, so e.g. ``age=23`` works as well
            as ``age='23'``.
    """
    print("\nBEGIN: REPORT\n")
    for key, value in kwargs.items():
        # str() avoids a TypeError when a value is not already a string.
        print(key + ": " + str(value))
    print("\nEND REPORT")

report_status(name='Adham', age='23', job="programmer")
report_status(name='Aya', job="teacher")


BEGIN: REPORT

name: Adham
age: 23
job: programmer

END REPORT

BEGIN: REPORT

name: Aya
job: teacher

END REPORT


Bringing it all together

In [12]:
def count_entries(df, col_name='lang'):
    """Tally the occurrences of each value in one column.

    Args:
        df: Object indexable by column name (a pandas DataFrame in this notebook).
        col_name: Column to tally; defaults to 'lang'.

    Returns:
        dict mapping each value in the column to how many times it appears.
    """
    occurrences = {}
    for item in df[col_name]:
        # Make sure the key exists, then bump its counter.
        occurrences.setdefault(item, 0)
        occurrences[item] += 1
    return occurrences

result1 = count_entries(df)
return2 = count_entries(df, 'source')

print(result1)
print(return2)


{'en': 97, 'et': 1, 'und': 2}
{'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 24, '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>': 1, '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>': 26, '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 33, '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>': 2, '<a href="http://www.google.com/" rel="nofollow">Google</a>': 2, '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>': 6, '<a href="http://linkis.com" rel="nofollow">Linkis.com</a>': 2, '<a href="http://rutracker.org/forum/viewforum.php?f=93" rel="nofollow">newzlasz</a>': 2, '<a href="http://ifttt.com" rel="nofollow">IFTTT</a>': 1, '<a href="http://www.myplume.com/" rel="nofollow">PlumeÂ forÂ Android</a>': 1}


In [13]:
def count_entries(df, *args):
    """Tally value occurrences across any number of columns into one dict.

    Args:
        df: Object indexable by column name (a pandas DataFrame in this notebook).
        *args: Names of the columns to scan, processed in the order given.

    Returns:
        A single dict mapping each value to its total number of occurrences
        over all requested columns (counts from different columns are merged).
    """
    tally = {}
    # Walk every entry of every requested column, column by column.
    for value in (item for name in args for item in df[name]):
        tally[value] = tally.get(value, 0) + 1
    return tally

result1 = count_entries(df, 'lang')
result2 = count_entries(df, 'lang', 'source')

print(result1)
print('----------')
print(result2)

{'en': 97, 'et': 1, 'und': 2}
----------
{'en': 97, 'et': 1, 'und': 2, '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 24, '<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>': 1, '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>': 26, '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 33, '<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>': 2, '<a href="http://www.google.com/" rel="nofollow">Google</a>': 2, '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>': 6, '<a href="http://linkis.com" rel="nofollow">Linkis.com</a>': 2, '<a href="http://rutracker.org/forum/viewforum.php?f=93" rel="nofollow">newzlasz</a>': 2, '<a href="http://ifttt.com" rel="nofollow">IFTTT</a>': 1, '<a href="http://www.myplume.com/" rel="nofollow">PlumeÂ forÂ Android</a>': 1}


## 3. Lambda functions and error-handling
Lambda functions

In [14]:
add_bangs = (lambda a: a + '!!!')
add_bangs("HI")

'HI!!!'

In [15]:
echo_word = (lambda word, echo: word * echo)
echo_word("Hey", 5)

'HeyHeyHeyHeyHey'

In [16]:
def square(n):
    """Return ``n`` multiplied by itself."""
    return n * n
my_list = [2,3,4,5,6,7,8,9]
update_list = map(square, my_list)
print(update_list)
print(list(update_list))

<map object at 0x000001D50F8ED690>
[4, 9, 16, 25, 36, 49, 64, 81]


In [17]:
spells = ['protect', 'act', 'patrons', 'lawful']
shout_spells = map(lambda item: item + '!!!', spells)
print(list(shout_spells))

['protect!!!', 'act!!!', 'patrons!!!', 'lawful!!!']


In [18]:
fellowship = [
    "frodo",
    "samwise",
    "merry",
    "pippin",
    "aragorn",
    "boromir",
    "legolas",
    "gimli",
    "gandalf",
]
result = filter(lambda a: len(a)>6 , fellowship)
print(list(result))

['samwise', 'aragorn', 'boromir', 'legolas', 'gandalf']


In [19]:
from functools import reduce

stark = ['robb', 'sansa', 'arya', 'brandon', 'rickon']
result = reduce(lambda item1, item2: item1 + item2, stark)
print(result)

robbsansaaryabrandonrickon


Introduction to error handling

In [20]:
def shout_echo(word1, echo=1):
    """Repeat ``word1`` ``echo`` times and append '!!!', tolerating bad types.

    Args:
        word1: Text to repeat.
        echo: Number of repetitions (default 1).

    Returns:
        The repeated text followed by '!!!', or '' when the operand types do
        not support the operation (a hint is printed in that case).
    """
    try:
        return word1 * echo + '!!!'
    except TypeError:
        # Only type mismatches are reported here; any other exception
        # (e.g. OverflowError, KeyboardInterrupt) propagates instead of
        # being hidden behind this message.
        print('word1 must be a string and echo must be a number.')
        return ''

shout_echo('particle', echo='accelerator')

word1 must be a string and echo must be a number.


''

In [21]:
def shout_echo(word1, echo=1):
    """Repeat ``word1`` ``echo`` times and append '!!!'.

    Args:
        word1: Text to repeat.
        echo: Number of repetitions; must not be negative (0 is allowed and
            yields just '!!!').

    Returns:
        The repeated text followed by '!!!'.

    Raises:
        ValueError: If ``echo`` is negative.
    """
    if echo < 0:
        # The message states the actual condition: zero is accepted.
        raise ValueError('echo must be greater than or equal to 0')

    return word1 * echo + '!!!'

shout_echo('particle', echo=5)

'particleparticleparticleparticleparticle!!!'

Bringing it all together

In [22]:
result = filter(lambda x: x[0:2]=="RT", df['text'])
res_list = list(result)
for tween in res_list:
    print(tween)

RT @bpolitics: .@krollbondrating's Christopher Whalen says Clinton is the weakest Dem candidate in 50 years https://t.co/pLk7rvoRSn https:/â€¦
RT @HeidiAlpine: @dmartosko Cruz video found.....racing from the scene.... #cruzsexscandal https://t.co/zuAPZfQDk3
RT @AlanLohner: The anti-American D.C. elites despise Trump for his America-first foreign policy. Trump threatens their gravy train. https:â€¦
RT @BIackPplTweets: Young Donald trump meets his neighbor  https://t.co/RFlu17Z1eE
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @HouseCracka: 29,000+ PEOPLE WATCHING TRUMP LIVE ON ONE STREAM!!!

https://t.co/7QCFz9ehNe
RT @urfavandtrump: RT for Brendon Urie
Fav for Donald Trump https://t.co/PZ5vS94lOg
RT @trapgrampa: This is how I see #Trump every time he speaks. https://t.co/fYSiHNS0nT
RT @trumpresearch: @WaitingInBagdad @thehill Trump supporters have selective amnisia.
RT @Pjw20161951: NO KIDDING: #SleazyDonald just attacked Scott Walker for NOT R

In [23]:
def count_entries(df, col_name='lang'):
    """Tally the occurrences of each value in one column, tolerating a missing column.

    Args:
        df: Object indexable by column name (a pandas DataFrame in this notebook).
        col_name: Column to tally; defaults to 'lang'.

    Returns:
        dict mapping each value to its number of occurrences, or ``None``
        (after printing a message) when the column does not exist.
    """
    # Guard only the lookup: a KeyError here really means "no such column".
    # Errors raised while counting are unrelated and must not be reported
    # with the missing-column message.
    try:
        col = df[col_name]
    except KeyError:
        print('The DataFrame does not have a ' + str(col_name) + ' column')
        return None

    cols_count = {}
    for entry in col:
        cols_count[entry] = cols_count.get(entry, 0) + 1
    return cols_count

result = count_entries(df, 'lang')
print(result)

{'en': 97, 'et': 1, 'und': 2}


In [24]:
def count_entries(df, col_name='lang'):
    """Tally the occurrences of each value in one DataFrame column.

    Args:
        df: DataFrame (anything exposing ``.columns`` and column indexing).
        col_name: Column to tally; defaults to 'lang'.

    Returns:
        dict mapping each value in the column to its number of occurrences.

    Raises:
        ValueError: If ``col_name`` is not among ``df.columns``.
    """
    if col_name in df.columns:
        tally = {}
        for value in df[col_name]:
            tally[value] = tally.get(value, 0) + 1
        return tally

    raise ValueError('The DataFrame does not have a ' + col_name + ' column')

result = count_entries(df, col_name='lang')
print(result)

{'en': 97, 'et': 1, 'und': 2}


In [25]:
def count_entries(df, col_name="lang"):
    """Tally the occurrences of each value in one DataFrame column.

    Args:
        df: DataFrame (anything exposing ``.columns`` and column indexing).
        col_name: Column to tally; defaults to "lang".

    Returns:
        dict mapping each value in the column to its number of occurrences.

    Raises:
        ValueError: If ``col_name`` is not among ``df.columns``.
    """
    if col_name not in df.columns:
        raise ValueError("The DataFrame does not have a " + col_name + " column")

    # The column is known to exist at this point, so no try/except is needed:
    # catching everything here could only hide unrelated errors behind a
    # misleading "missing column" message and a silent ``None`` return.
    cols_count = {}
    for entry in df[col_name]:
        cols_count[entry] = cols_count.get(entry, 0) + 1
    return cols_count


result = count_entries(df, col_name="lang")
print(result)

{'en': 97, 'et': 1, 'und': 2}


In [26]:
world_blank = pd.read_csv('../data/world_bank.csv')
world_blank.head(3)

Unnamed: 0,CountryName,CountryCode,Year,Total Population,Urban population (% of total)
0,Arab World,ARB,1960,92495902.0,31.285384
1,Caribbean small states,CSS,1960,4190810.0,31.59749
2,Central Europe and the Baltics,CEB,1960,91401583.0,44.507921


## 4. Using iterators in PythonLand
Introduction to iterators

In [27]:
flash = ["jay garrick", "barry allen", "wally west", "bart allen"]

for element in flash:
    print("**", element)

superhero = iter(flash)

print(next(superhero))
print(next(superhero))
print(next(superhero))

** jay garrick
** barry allen
** wally west
** bart allen
jay garrick
barry allen
wally west


In [28]:
small_value = iter(range(3))
print("-", next(small_value))
print("-", next(small_value))
print("-", next(small_value))

for num in range(3):
    print(num)

googol = iter(range(10**100))
print("**", next(googol))
print("**", next(googol))
print("**", next(googol))
print("**", next(googol))
print("**", next(googol))

- 0
- 1
- 2
0
1
2
** 0
** 1
** 2
** 3
** 4


In [29]:
word = 'Data'
it = iter(word)
print(*it)

D a t a


In [30]:
pythonistas = {'hugo': 'bowne-anderson', 'francis': 'castro'}
for key, value in pythonistas.items():    
    print(key, value)

hugo bowne-anderson
francis castro


In [31]:
mutants = [
    "charles xavier",
    "bobby drake",
    "kurt wagner",
    "max eisenhardt",
    "kitty pryde",
]

aliases = ["prof x", "iceman", "nightcrawler", "magneto", "shadowcat"]
powers = [
    "telepathy",
    "thermokinesis",
    "teleportation",
    "magnetokinesis",
    "intangibility",
]

mutant_list = list(enumerate(mutants))
print(mutant_list)

for index, value in enumerate(mutants):
    print(">", index, value)

for index, value in enumerate(mutants, 1):
    print(index, value)

[(0, 'charles xavier'), (1, 'bobby drake'), (2, 'kurt wagner'), (3, 'max eisenhardt'), (4, 'kitty pryde')]
> 0 charles xavier
> 1 bobby drake
> 2 kurt wagner
> 3 max eisenhardt
> 4 kitty pryde
1 charles xavier
2 bobby drake
3 kurt wagner
4 max eisenhardt
5 kitty pryde


In [32]:
mutant_data = list(zip(mutants,aliases,powers))
print(mutant_data)

mutant_zip = zip(mutants,aliases,powers)
print(mutant_zip)

for value1, value2, value3 in mutant_zip:
    print(value1, value2, value3)

[('charles xavier', 'prof x', 'telepathy'), ('bobby drake', 'iceman', 'thermokinesis'), ('kurt wagner', 'nightcrawler', 'teleportation'), ('max eisenhardt', 'magneto', 'magnetokinesis'), ('kitty pryde', 'shadowcat', 'intangibility')]
<zip object at 0x000001D50F90FD00>
charles xavier prof x telepathy
bobby drake iceman thermokinesis
kurt wagner nightcrawler teleportation
max eisenhardt magneto magnetokinesis
kitty pryde shadowcat intangibility


In [33]:
# Unpacking the zip object with * hands each (mutant, power) pair to print()
# as a separate argument.
z1 = zip(mutants, powers)
print(*z1)

# The zip object above was consumed by print(), so it is created again here.
z1 = zip(mutants, powers)
# zip(*pairs) regroups the pairs into one tuple per original sequence.
result1, result2 = zip(*z1)
# result1/result2 are tuples while mutants/powers are lists, and a tuple never
# compares equal to a list, so both lines print False even though the elements
# match (compare against list(result1) / list(result2) to check contents).
print(result1 == mutants)
print(result2 == powers)

('charles xavier', 'telepathy') ('bobby drake', 'thermokinesis') ('kurt wagner', 'teleportation') ('max eisenhardt', 'magnetokinesis') ('kitty pryde', 'intangibility')
False
False


Using iterators to load large files into memory

In [34]:
def count_entries(csv_file, c_size, colname):
    """Tally the values of one CSV column while reading the file in chunks.

    Args:
        csv_file: Path or file-like object passed to ``pd.read_csv``.
        c_size: Number of rows per chunk (``chunksize`` for ``pd.read_csv``).
        colname: Name of the column whose entries are tallied.

    Returns:
        dict mapping each value in the column to its number of occurrences
        across all chunks.
    """
    frequencies = {}
    reader = pd.read_csv(csv_file, chunksize=c_size)
    # Stream every entry of the column, one chunk at a time.
    for value in (item for chunk in reader for item in chunk[colname]):
        frequencies[value] = frequencies.get(value, 0) + 1
    return frequencies

result = count_entries('../data/tweets.csv', 10, 'lang')
print(result)

{'en': 97, 'et': 1, 'und': 2}


## 5. List comprehensions and generators

List comprehensions

In [35]:
nums = [12, 8, 21, 3, 16]
new_nums = []
for num in nums:
    new_nums.append(num + 1)

print("BEFORE -----")
print(new_nums, '\n')

new_nums = [num + 1 for num in nums]
print("AFTER -----")
print(new_nums)

BEFORE -----
[13, 9, 22, 4, 17] 

AFTER -----
[13, 9, 22, 4, 17]


In [36]:
[i**2 for i in range(10)]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [37]:
matrix = [[col for col in range(5)] for row in range(5)]
for row in matrix:
    print(row)

[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


Advanced comprehensions

[ output expression for iterator variable in iterable if predicate expression ]

In [38]:
fellowship = ["frodo", "samwise", "merry", "aragorn", "legolas", "boromir", "gimli"]
new_fellowship = [member for member in fellowship if len(member) >= 7]
print(new_fellowship)

['samwise', 'aragorn', 'legolas', 'boromir']


In [39]:
[num ** 2 if num % 2 == 0 else 0 for num in range(10)]

[0, 0, 4, 0, 16, 0, 36, 0, 64, 0]

In [40]:
new_fellowship = [member if len(member) >= 7 else '' for member in fellowship]
print(new_fellowship)

['', 'samwise', '', 'aragorn', 'legolas', 'boromir', '']


In [41]:
new_fellowship = {member : len(member) for member in fellowship}
print(new_fellowship)

{'frodo': 5, 'samwise': 7, 'merry': 5, 'aragorn': 7, 'legolas': 7, 'boromir': 7, 'gimli': 5}


Introduction to generator expressions

In [42]:
[num ** 2 for num in range(10)]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [43]:
result = (num for num in range(11))

print("-", next(result))
print("-", next(result))

for num in result:
    print(num)

- 0
- 1
2
3
4
5
6
7
8
9
10


In [44]:
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

lengths = (len(person) for person in lannister)
for value in lengths:
    print(value)

6
5
5
6
7


In [45]:
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']

def get_lenths(inpput_list):
    """Generator that lazily yields ``len(item)`` for each item, in order."""
    yield from map(len, inpput_list)

for value in get_lenths(lannister):
    print(value)

6
5
5
6
7


Wrapping up comprehensions and generators

In [46]:
tweet_time = df['created_at']

tweet_clock_time = [entry[11:19] for entry in tweet_time]
print(tweet_clock_time)

['23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:17', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:17', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:18', '23:40:19', '23:40:18', '23:40:18', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:18', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23

In [47]:
tweet_time = df['created_at']

tweet_clock_time = [entry[11:19] for entry in tweet_time if entry[17:19] == '19']
print(tweet_clock_time)

['23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19', '23:40:19']


## 6. Bringing it all together!

In [48]:
feature_names = ['CountryName','CountryCode', 'IndicatorName', 'IndicatorCode', 'Year', 'Value']
row_vals = ['Arab World','ARB', 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'SP.ADO.TFRT',
 '1960','133.56090740552298']

In [49]:
def lists2dic(list1, list2):
    """Build a dict pairing items of ``list1`` (keys) with items of ``list2`` (values).

    Pairing stops at the shorter of the two inputs.
    """
    return {key: value for key, value in zip(list1, list2)}

result = lists2dic(feature_names, row_vals)
print(result)

{'CountryName': 'Arab World', 'CountryCode': 'ARB', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'IndicatorCode': 'SP.ADO.TFRT', 'Year': '1960', 'Value': '133.56090740552298'}


In [50]:
row_lists = [['Arab World',
  'ARB',
  'Adolescent fertility rate (births per 1,000 women ages 15-19)',
  'SP.ADO.TFRT',
  '1960',
  '133.56090740552298'],
 ['Arab World',
  'ARB',
  'Age dependency ratio (% of working-age population)',
  'SP.POP.DPND',
  '1960',
  '87.7976011532547'],
 ['Arab World',
  'ARB',
  'Age dependency ratio, old (% of working-age population)',
  'SP.POP.DPND.OL',
  '1960',
  '6.634579191565161'],
 ['Arab World',
  'ARB',
  'Age dependency ratio, young (% of working-age population)',
  'SP.POP.DPND.YG',
  '1960',
  '81.02332950839141']]

In [51]:
list_of_dicts = [lists2dic(feature_names, row) for row in row_lists]
print(list_of_dicts[0])
print(list_of_dicts[1])

{'CountryName': 'Arab World', 'CountryCode': 'ARB', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'IndicatorCode': 'SP.ADO.TFRT', 'Year': '1960', 'Value': '133.56090740552298'}
{'CountryName': 'Arab World', 'CountryCode': 'ARB', 'IndicatorName': 'Age dependency ratio (% of working-age population)', 'IndicatorCode': 'SP.POP.DPND', 'Year': '1960', 'Value': '87.7976011532547'}


In [52]:
df = pd.DataFrame(list_of_dicts)
df

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,133.56090740552298
1,Arab World,ARB,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,87.7976011532547
2,Arab World,ARB,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,6.634579191565161
3,Arab World,ARB,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,81.02332950839141


Using Python generators for streaming data

In [53]:
with open('../data/world_bank.csv') as file:
    # Read and discard the first line (the CSV header) so it is not tallied.
    file.readline()
    counts_dict = {}

    # Tally the first comma-separated field of the next 10 lines only.
    for j in range(10):
        # Plain split on ',' — no CSV quoting is taken into account.
        line = file.readline().split(',')
        first_col = line[0]
        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1
        else:
            counts_dict[first_col] = 1

counts_dict

{'Arab World': 1,
 'Caribbean small states': 1,
 'Central Europe and the Baltics': 1,
 'East Asia & Pacific (all income levels)': 1,
 'East Asia & Pacific (developing only)': 1,
 'Euro area': 1,
 'Europe & Central Asia (all income levels)': 1,
 'Europe & Central Asia (developing only)': 1,
 'European Union': 1,
 'Fragile and conflict affected situations': 1}

In [54]:
def read_large_file(file_object):
    """Generator yielding the lines of ``file_object`` one at a time.

    Lines are fetched lazily with ``readline()``; iteration ends when
    ``readline()`` returns an empty (falsy) value, i.e. at end of file.
    """
    line = file_object.readline()
    while line:
        yield line
        line = file_object.readline()

with open('../data/world_bank.csv') as file:
    gen_file = read_large_file(file)
    print(next(gen_file))
    print(next(gen_file))
    print(next(gen_file))

CountryName,CountryCode,Year,Total Population,Urban population (% of total)

Arab World,ARB,1960,92495902.0,31.285384211605397

Caribbean small states,CSS,1960,4190810.0,31.5974898513652



In [55]:
# Tally the first comma-separated field of every line in the file.
counts_dict = {}

with open('../data/world_bank.csv') as file:
    # read_large_file yields the file line by line, so the whole file is
    # never held in memory at once.
    # NOTE: the header line is not skipped here, so its first field
    # ('CountryName') ends up in the tally with a count of 1.
    for line in read_large_file(file):
        row = line.split(',')
        first_col = row[0]

        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1
        else:
            counts_dict[first_col] = 1
    
counts_dict

{'CountryName': 1,
 'Arab World': 55,
 'Caribbean small states': 55,
 'Central Europe and the Baltics': 55,
 'East Asia & Pacific (all income levels)': 55,
 'East Asia & Pacific (developing only)': 55,
 'Euro area': 55,
 'Europe & Central Asia (all income levels)': 55,
 'Europe & Central Asia (developing only)': 55,
 'European Union': 55,
 'Fragile and conflict affected situations': 55,
 'Heavily indebted poor countries (HIPC)': 55,
 'High income': 55,
 'High income: nonOECD': 55,
 'High income: OECD': 55,
 'Latin America & Caribbean (all income levels)': 55,
 'Latin America & Caribbean (developing only)': 55,
 'Least developed countries: UN classification': 55,
 'Low & middle income': 55,
 'Low income': 55,
 'Lower middle income': 55,
 'Middle East & North Africa (all income levels)': 55,
 'Middle East & North Africa (developing only)': 55,
 'Middle income': 55,
 'North America': 55,
 'OECD members': 55,
 'Other small states': 55,
 'Pacific island small states': 55,
 'Small states': 5

Using pandas' read_csv iterator for streaming data

In [56]:
df_reader = pd.read_csv('../data/world_bank.csv' , chunksize=8)
print(next(df_reader))
print(next(df_reader))

                                 CountryName CountryCode  Year  \
0                                 Arab World         ARB  1960   
1                     Caribbean small states         CSS  1960   
2             Central Europe and the Baltics         CEB  1960   
3    East Asia & Pacific (all income levels)         EAS  1960   
4      East Asia & Pacific (developing only)         EAP  1960   
5                                  Euro area         EMU  1960   
6  Europe & Central Asia (all income levels)         ECS  1960   
7    Europe & Central Asia (developing only)         ECA  1960   

   Total Population  Urban population (% of total)  
0      9.249590e+07                      31.285384  
1      4.190810e+06                      31.597490  
2      9.140158e+07                      44.507921  
3      1.042475e+09                      22.471132  
4      8.964930e+08                      16.917679  
5      2.653965e+08                      62.096947  
6      6.674890e+08               

In [57]:
urb_pop_reader = pd.read_csv('../data/world_bank.csv' , chunksize=1000)

df_urb_pop = next(urb_pop_reader)
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB']

pops = zip(df_pop_ceb['Total Population'], df_pop_ceb['Urban population (% of total)'])
pop_list = list(pops)
pop_list

[(91401583.0, 44.5079211390026),
 (92237118.0, 45.206665319194),
 (93014890.0, 45.866564696018),
 (93845749.0, 46.5340927663649),
 (94722599.0, 47.2087429803526)]

In [None]:
import matplotlib.pyplot as plt

urb_pop_reader = pd.read_csv("../data/world_bank.csv", chunksize=1000)

# Only the first chunk (at most 1000 rows) is used in this cell.
df_urb_pop = next(urb_pop_reader)
# .copy() makes the filtered rows an independent DataFrame, so adding a column
# below does not write into a slice of df_urb_pop (SettingWithCopyWarning).
df_pop_ceb = df_urb_pop[df_urb_pop["CountryCode"] == "CEB"].copy()

pops = zip(df_pop_ceb["Total Population"], df_pop_ceb["Urban population (% of total)"])
pop_list = list(pops)

# Urban population = total population * urban share (given in percent),
# truncated to a whole number.
df_pop_ceb["Total Urban Population"] = [
    int(total * pct * 0.01) for total, pct in pop_list
]
df_pop_ceb.plot(kind="scatter", x="Year", y="Total Urban Population")
plt.show()

In [None]:
urb_pop_reader = pd.read_csv("../data/world_bank.csv", chunksize=1000)

# Collect the per-chunk results and concatenate once at the end, instead of
# re-copying an ever-growing DataFrame on every iteration.
ceb_chunks = []

for df_urb_pop in urb_pop_reader:
    # .copy() makes the filtered rows an independent DataFrame, so adding a
    # column below does not write into a slice of the chunk.
    df_pop_ceb = df_urb_pop[df_urb_pop["CountryCode"] == "CEB"].copy()
    pops = zip(df_pop_ceb["Total Population"], df_pop_ceb["Urban population (% of total)"])
    pop_list = list(pops)
    # Urban population = total population * urban share (given in percent),
    # truncated to a whole number.
    df_pop_ceb["Total Urban Population"] = [
        int(total * pct * 0.01) for total, pct in pop_list
    ]
    ceb_chunks.append(df_pop_ceb)

data = pd.concat(ceb_chunks)

data.plot(kind="scatter", x="Year", y="Total Urban Population")
plt.show()

In [None]:
def plot_pop(filename, country_code):
    """Scatter-plot total urban population per year for one country code.

    The CSV is read in chunks of 1000 rows; rows whose 'CountryCode' equals
    ``country_code`` are kept, a 'Total Urban Population' column is derived
    from 'Total Population' and 'Urban population (% of total)', and the
    result is plotted against 'Year'.

    Args:
        filename: Path (or file-like object) of the CSV passed to ``pd.read_csv``.
        country_code: Value of the 'CountryCode' column to select.
    """
    country_chunks = []
    for df_urb_pop in pd.read_csv(filename, chunksize=1000):
        # .copy() makes the filtered rows an independent DataFrame, so adding
        # a column does not write into a slice of the chunk
        # (SettingWithCopyWarning / chained assignment).
        df_pop = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()

        # Urban population = total population * urban share (in percent),
        # truncated to a whole number.
        df_pop['Total Urban Population'] = [
            int(total * pct * 0.01)
            for total, pct in zip(df_pop['Total Population'],
                                  df_pop['Urban population (% of total)'])
        ]
        country_chunks.append(df_pop)

    # One concat at the end instead of growing a DataFrame inside the loop.
    data = pd.concat(country_chunks)
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()


fn = 'ind_pop_data.csv'

plot_pop('../data/world_bank.csv','CEB')
plot_pop('../data/world_bank.csv','ARB')