### Python pandas library - storing and manipulating data in "dataframes" (tables)

In [1]:
import pandas as pd

In [2]:
# Portability shim: when the INSTABASE_URI environment variable is set,
# file access goes through ib.open; otherwise Python's own open() is kept.
# NOTE(review): `ib` is not defined anywhere in this notebook - presumably the
# Instabase runtime provides it; confirm before running there.
import os
IB = 'INSTABASE_URI' in os.environ
open = ib.open if IB else open

### Reading from CSV file into dataframe

In [3]:
# Load Cities.csv into the `cities` dataframe.
# The handle is closed explicitly once pandas has parsed it (try/finally is
# used rather than `with`, because open() may have been rebound to ib.open by
# the compatibility cell and only close() is assumed of that object).
# Mode 'r' replaces the original 'rU': the 'U' flag is rejected by
# Python 3.11+, and pandas recognises the usual line terminators itself.
f = open('Cities.csv', 'r')
try:
    cities = pd.read_csv(f)
finally:
    f.close()

In [4]:
# A bare dataframe as the last expression of a cell is displayed as a table
cities

Unnamed: 0,city,country,latitude,longitude,temperature
0,Aalborg,Denmark,57.03,9.92,7.52
1,Aberdeen,United Kingdom,57.17,-2.08,8.10
2,Abisko,Sweden,63.35,18.83,0.20
3,Adana,Turkey,36.99,35.32,18.67
4,Albacete,Spain,39.00,-1.87,12.62
5,Algeciras,Spain,36.13,-5.47,17.38
6,Amiens,France,49.90,2.30,10.17
7,Amsterdam,Netherlands,52.35,4.92,8.93
8,Ancona,Italy,43.60,13.50,13.52
9,Andorra,Andorra,42.50,1.52,9.60


In [5]:
# How many rows the dataframe holds (its index has one entry per row)
len(cities.index)

213

In [6]:
# Peek at the top of the table: the first five rows
cities.head(n=5)

Unnamed: 0,city,country,latitude,longitude,temperature
0,Aalborg,Denmark,57.03,9.92,7.52
1,Aberdeen,United Kingdom,57.17,-2.08,8.1
2,Abisko,Sweden,63.35,18.83,0.2
3,Adana,Turkey,36.99,35.32,18.67
4,Albacete,Spain,39.0,-1.87,12.62


In [7]:
# The final 20 rows of the table
cities.tail(n=20)

Unnamed: 0,city,country,latitude,longitude,temperature
193,Tartu,Estonia,58.38,26.71,4.36
194,Tekirdag,Turkey,40.99,27.51,13.02
195,Toulouse,France,43.62,1.45,10.25
196,Trabzon,Turkey,40.98,39.72,10.28
197,Trieste,Italy,45.65,13.8,11.21
198,Trikala,Greece,39.56,21.77,16.0
199,Trondheim,Norway,63.42,10.42,4.53
200,Turku,Finland,60.45,22.25,4.72
201,Uppsala,Sweden,59.86,17.64,4.17
202,Valencia,Spain,39.49,-0.4,16.02


### Sorting, selecting rows and columns

In [8]:
# Order rows by country (ascending); within a country, warmest city first
cities.sort_values(by=['country', 'temperature'], ascending=[True, False])

Unnamed: 0,city,country,latitude,longitude,temperature
78,Elbasan,Albania,41.12,20.08,15.18
9,Andorra,Andorra,42.50,1.52,9.60
203,Vienna,Austria,48.20,16.37,7.86
95,Graz,Austria,47.08,15.41,6.91
125,Linz,Austria,48.32,14.29,6.79
175,Salzburg,Austria,47.81,13.04,4.62
105,Innsbruck,Austria,47.28,11.41,4.54
47,Brest,Belarus,52.10,23.70,6.73
161,Pinsk,Belarus,52.13,26.09,6.42
138,Mazyr,Belarus,52.05,29.27,6.25


In [9]:
# Picking out one column yields a 'series' rather than a dataframe
cities['city']
# Also show cities.city, cities['temperature'], cities.temperature

0           Aalborg
1          Aberdeen
2            Abisko
3             Adana
4          Albacete
5         Algeciras
6            Amiens
7         Amsterdam
8            Ancona
9           Andorra
10           Angers
11           Ankara
12          Antalya
13             Arad
14           Athens
15         Augsburg
16            Bacau
17          Badajoz
18        Baia Mare
19            Balti
20        Barcelona
21             Bari
22            Basel
23           Batman
24          Belfast
25         Belgrade
26          Bergamo
27           Bergen
28           Berlin
29        Bialystok
           ...     
183           Split
184    Stara Zagora
185       Stavanger
186       Stockholm
187            Sumy
188         Swansea
189          Szeged
190         Tallinn
191         Tampere
192          Tarsus
193           Tartu
194        Tekirdag
195        Toulouse
196         Trabzon
197         Trieste
198         Trikala
199       Trondheim
200           Turku
201         Uppsala


In [10]:
# A list of column names selects several columns - the result is a dataframe
cities.loc[:, ['city', 'temperature']]
# Also show cities[['city']]

Unnamed: 0,city,temperature
0,Aalborg,7.52
1,Aberdeen,8.10
2,Abisko,0.20
3,Adana,18.67
4,Albacete,12.62
5,Algeciras,17.38
6,Amiens,10.17
7,Amsterdam,8.93
8,Ancona,13.52
9,Andorra,9.60


In [11]:
# Keep only the rows that satisfy a condition (here: negative longitude).
# pandas infers a type for each column, so no conversion is needed first.
cities[cities['longitude'] < 0]

Unnamed: 0,city,country,latitude,longitude,temperature
1,Aberdeen,United Kingdom,57.17,-2.08,8.1
4,Albacete,Spain,39.0,-1.87,12.62
5,Algeciras,Spain,36.13,-5.47,17.38
10,Angers,France,47.48,-0.53,10.98
17,Badajoz,Spain,38.88,-6.97,15.61
24,Belfast,United Kingdom,54.6,-5.96,8.48
32,Bilbao,Spain,43.25,-2.93,11.41
33,Birmingham,United Kingdom,52.47,-1.92,8.81
34,Blackpool,United Kingdom,53.83,-3.05,9.15
38,Bordeaux,France,44.85,-0.6,11.87


In [12]:
# Pick rows by position: positions 15 up to (not including) 20
cities.iloc[15:20]
# Show cities[:8] and cities[200:]

Unnamed: 0,city,country,latitude,longitude,temperature
15,Augsburg,Germany,48.35,10.9,4.54
16,Bacau,Romania,46.58,26.92,7.51
17,Badajoz,Spain,38.88,-6.97,15.61
18,Baia Mare,Romania,47.66,23.58,8.87
19,Balti,Moldova,47.76,27.91,8.23


In [13]:
# Combining the three steps - filter rows, pick columns, sort:
# city and longitude of every city with latitude > 50 and
# temperature > 9, ordered by longitude
temp1 = cities.loc[(cities['latitude'] > 50) & (cities['temperature'] > 9)]
temp2 = temp1.loc[:, ['city', 'longitude']]
temp3 = temp2.sort_values(by='longitude')
temp3
# Show eliminating temp3, then temp2, then temp1 (use \ for long lines)
# Note similar functionality to SQL

Unnamed: 0,city,longitude
88,Galway,-9.05
67,Cork,-8.5
188,Swansea,-3.95
84,Exeter,-3.53
34,Blackpool,-3.05
40,Bournemouth,-1.9
58,Cambridge,0.12
123,Lille,3.08
49,Brugge,3.23


### <font color="green">Your Turn</font>

In [14]:
# Read the Countries.csv file into a dataframe.
# The handle is closed explicitly once pandas has parsed it (try/finally is
# used rather than `with`, because open() may have been rebound to ib.open by
# the compatibility cell and only close() is assumed of that object).
# Mode 'r' replaces the original 'rU': the 'U' flag is rejected by
# Python 3.11+, and pandas recognises the usual line terminators itself.
f = open('Countries.csv', 'r')
try:
    countries = pd.read_csv(f)
finally:
    f.close()

In [15]:
# Find all countries that are not in the EU and don't
# have coastline, together with their populations,
# sorted by population (smallest to largest)



SyntaxError: invalid syntax (<ipython-input-15-ff6d6631e80d>, line 4)

### Aggregation

In [None]:
# Minimum and maximum temperature.
# Series.min()/max() are used instead of the min()/max() builtins, which loop
# over the values in Python and give order-dependent answers if a NaN is
# present.  print() receives a single pre-formatted string so the cell runs
# on both Python 2 and Python 3 with the same output text.
print('Minimum temperature: %s' % cities.temperature.min())
print('Maximum temperature: %s' % cities.temperature.max())

In [None]:
# Average temperature, computed three equivalent ways.
# print() receives a single pre-formatted string so the cell runs on both
# Python 2 and Python 3 with the same output text.
import numpy as np
print('Using sum/count: %s' % (sum(cities.temperature)/len(cities.temperature)))
print('Using numpy: %s' % np.average(cities.temperature))
print('Using built-in mean: %s' % cities.temperature.mean())

In [None]:
# Average temperature of cities in each country.
# The temperature column is selected BEFORE aggregating, so the text columns
# (e.g. city) are never averaged: on pandas >= 2.0 a frame-wide mean() raises
# a TypeError for them unless numeric_only=True is passed.  The result is the
# same series as the frame-wide mean followed by .temperature.
cities.groupby('country')['temperature'].mean()
# or [['temperature']]
# Also show without column selection (pass numeric_only=True on pandas >= 2.0)

### <font color="green">Your Turn</font>

In [None]:
# Find the average population of countries with coastline
# and countries without coastline
# Hint: You can use groupby!

In [None]:
# Then modify to group by both coastline and EU

### Joining

In [None]:
# Join each city row with its country's row, matching on the 'country' column
pd.merge(cities, countries, on='country')

In [None]:
# Joining is symmetric: the same join with the two dataframes swapped
pd.merge(countries, cities, on='country')

### Miscellaneous features

In [None]:
# String operations via the .str accessor - countries whose name contains 'ia'
countries[countries['country'].str.contains('ia')]

In [None]:
# Plotting
%matplotlib inline
cities.plot.scatter(x='latitude', y='temperature')

In [None]:
# Add a fahrenheit column computed from temperature as t * 9/5 + 32,
# then display the updated dataframe
cities['fahrenheit'] = cities['temperature'] * 9 / 5 + 32
cities

### Reminders

In [None]:
# "Queries" - a cell displays only its final expression, so just the
# second selection below is shown
cities[cities['longitude'] > 35]
cities[cities['longitude'] < -5]

In [None]:
# Keeping each query result in its own dataframe so it can be reused later
east = cities[cities['longitude'] > 35]
west = cities[cities['longitude'] < -5]
east

### <font color="green">Your Turn: World Cup data</font>

In [17]:
# Read the Players and Teams data into dataframes.
# Each file handle is closed as soon as pandas has parsed it; the original
# left both handles open.  Mode 'r' replaces 'rU': the 'U' flag is rejected
# by Python 3.11+, and pandas recognises the usual line terminators itself.
def _read_csv(path):
    """Parse the CSV file at `path` into a dataframe and close its handle."""
    # open() is looked up at call time, so an ib.open rebinding made by the
    # compatibility cell is honoured here as well.
    handle = open(path, 'r')
    try:
        return pd.read_csv(handle)
    finally:
        handle.close()

players = _read_csv('Players.csv')
teams = _read_csv('Teams.csv')

In [18]:
# What player on a team with 'ia' in the team name played less than
# 200 minutes and made more than 100 passes? Print the player surname.
# (This cell only prepares for the question: it joins every player row to
# its team's row on 'team'. The filter that answers it is in the next cell.)

nw_players = pd.merge(players, teams, on='team')
nw_players

Unnamed: 0,surname,team,position,minutes,shots,passes,tackles,saves,ranking,games,wins,draws,losses,goalsFor,goalsAgainst,yellowCards,redCards
0,Abdoun,Algeria,midfielder,16,0,6,0,0,30,3,0,1,2,0,2,4,2
1,Belhadj,Algeria,defender,270,1,146,8,0,30,3,0,1,2,0,2,4,2
2,Boudebouz,Algeria,midfielder,74,3,28,1,0,30,3,0,1,2,0,2,4,2
3,Bougherra,Algeria,defender,270,1,89,11,0,30,3,0,1,2,0,2,4,2
4,Chaouchi,Algeria,goalkeeper,90,0,17,0,2,30,3,0,1,2,0,2,4,2
5,Djebbour,Algeria,forward,123,3,19,1,0,30,3,0,1,2,0,2,4,2
6,Ghezzal,Algeria,forward,40,3,8,0,0,30,3,0,1,2,0,2,4,2
7,Guedioura,Algeria,midfielder,38,0,18,1,0,30,3,0,1,2,0,2,4,2
8,Halliche,Algeria,defender,270,2,94,4,0,30,3,0,1,2,0,2,4,2
9,Kadir,Algeria,midfielder,262,0,104,3,0,30,3,0,1,2,0,2,4,2


In [20]:
# What is the average number of passes made by forwards? By midfielders?
# Don't include any other positions in your result.
# Hint: groupby is NOT the easiest way to do this one!
# (The code in this cell answers the PREVIOUS cell's question - the player on
# an 'ia' team with under 200 minutes and over 100 passes; the pass averages
# are computed in the next cell.)

n_player = nw_players[
    nw_players['team'].str.contains('ia')
    & (nw_players['minutes'] < 200)
    & (nw_players['passes'] > 100)
]
n_player['surname']

431    Kuzmanovic
Name: surname, dtype: object

In [22]:
# Which team has the highest ratio of goalsFor to goalsAgainst?
# Print the team name only.
# Hint: Add a "ratio" column to the teams dataframe, then sort,
# then use head(1) or tail(1) depending how you sorted
# NOTE(review): the code below does not address the ratio question; it
# computes the forward / midfielder pass averages asked for in the
# previous cell.
# print() receives a single pre-formatted string so the cell runs on both
# Python 2 and Python 3; the labels and spacing match the original output.

f = nw_players[nw_players.position == 'forward']
m = nw_players[nw_players.position == 'midfielder']
print("Forwards:  %s" % f.passes.mean())
print("Midfielders %s" % m.passes.mean())

Forwards:  50.8251748252
Midfielders 95.2719298246


In [19]:
# How many players who play on a team with ranking <10 played
# more than 350 minutes?
# Reminder: len() gives number of rows in a dataframe
# print() is called as a function so the cell runs on both Python 2 and 3.

n_p = nw_players[(nw_players.ranking < 10) & (nw_players.minutes > 350)]
print(len(n_p))

54


In [None]:
# BONUS!
# Write a loop that interactively asks the user to enter a team name.
# If the team exists, list all of the players on that team
# (with all of their information), sorted by descending minutes played.
# If the team doesn't exist, print "Team not in 2010 World Cup".
# If 'quit' is entered, terminate the loop.
# Reminder: To read a string from the user instead of a number, use
# raw_input() instead of input()
# Note: To test if a value v is (not) in a column c of a dataframe D,
# use "v (not) in D.c.values"

# raw_input() exists only on Python 2; on Python 3 input() reads a string.
try:
    read_line = raw_input
except NameError:
    read_line = input

while True:
    # The answer is stored in the same variable that is tested below; the
    # earlier draft read into `team_name` but tested `t_name`, so the loop
    # could never see the user's input or terminate.
    t_name = read_line("Enter a team name: ")
    if t_name == "quit":
        break
    # Membership is checked against the same dataframe the roster is taken
    # from (the earlier draft referenced an undefined `merged_players`).
    if t_name in players.team.values:
        roster = players[players.team == t_name]
        # Most minutes played first, as the exercise asks.
        print(roster.sort_values('minutes', ascending=False))
    else:
        print("Team not in 2010 World Cup")