### Classification using scikit-learn (with pandas)

In [1]:
import csv
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# For compatibility across multiple platforms
import os

In [3]:
# Read Cities.csv into a dataframe and add a temperature-category column.
# The path is handed straight to pandas, which opens and closes the file
# itself; the previous open(..., 'rU') handle was never closed, and the 'U'
# mode flag is rejected by Python 3.11+.
cities = pd.read_csv('Cities.csv')


def temperature_category(temperature):
    """Return the category label for one temperature reading.

    Thresholds: below 5 is 'cold', below 9 is 'cool', below 15 is 'warm',
    and anything else (including a missing value, for which every
    comparison is False) is 'hot'.
    """
    if temperature < 5:
        return 'cold'
    if temperature < 9:
        return 'cool'
    if temperature < 15:
        return 'warm'
    return 'hot'


# Walk the column values directly instead of looking up each row through
# the .ix indexer, which was removed in pandas 1.0.
cats = [temperature_category(t) for t in cities['temperature']]
cities['category'] = cats

# Report how many cities fall in each category. A single-argument print(...)
# produces the same text on Python 2 and Python 3.
for label in ('cold', 'cool', 'warm', 'hot'):
    print('%s: %d' % (label, (cities['category'] == label).sum()))

cold: 17
cool: 92
warm: 79
hot: 25


# Hello World
Here I display the table to inspect its values

In [5]:
# Preview the first five rows (head() defaults to n=5)
cities.head()

Unnamed: 0,city,country,latitude,longitude,temperature,category
0,Aalborg,Denmark,57.03,9.92,7.52,cool
1,Aberdeen,United Kingdom,57.17,-2.08,8.1,cool
2,Abisko,Sweden,63.35,18.83,0.2,cold
3,Adana,Turkey,36.99,35.32,18.67,hot
4,Albacete,Spain,39.0,-1.87,12.62,warm


In [6]:
# Create training and test sets for cities data: the first 85% of the rows
# (in dataframe order) are used for training, the remainder for testing.
numitems = len(cities)
percenttrain = 0.85
numtrain = int(numitems * percenttrain)
numtest = numitems - numtrain
# Single-argument print(...) produces the same text on Python 2 and Python 3,
# whereas the print statement is a SyntaxError on Python 3.
print('Training set %d items' % numtrain)
print('Test set %d items' % numtest)
# Explicitly positional slices, so the split never depends on index labels.
citiesTrain = cities.iloc[:numtrain]
citiesTest = cities.iloc[numtrain:]

Training set 181 items
Test set 32 items


In [9]:
# Display the current value of numtrain (the training-set size)
numtrain

181

In [7]:
# Preview the first five training rows (head() defaults to n=5)
citiesTrain.head()

Unnamed: 0,city,country,latitude,longitude,temperature,category
0,Aalborg,Denmark,57.03,9.92,7.52,cool
1,Aberdeen,United Kingdom,57.17,-2.08,8.1,cool
2,Abisko,Sweden,63.35,18.83,0.2,cold
3,Adana,Turkey,36.99,35.32,18.67,hot
4,Albacete,Spain,39.0,-1.87,12.62,warm


In [8]:
# Preview the first five test rows (head() defaults to n=5)
citiesTest.head()

Unnamed: 0,city,country,latitude,longitude,temperature,category
181,Sivas,Turkey,39.75,37.03,8.05,cool
182,Skopje,Macedonia,42.0,21.43,9.36,warm
183,Split,Croatia,43.52,16.47,12.46,warm
184,Stara Zagora,Bulgaria,42.42,25.62,10.9,warm
185,Stavanger,Norway,58.97,5.68,5.53,cool


### K-nearest-neighbors classification

In [10]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
neighbors = 8
classifier = KNeighborsClassifier(neighbors)
classifier.fit(citiesTrain[features], citiesTrain['category'])
predictions = classifier.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
# Pair each prediction with the true label by position. This replaces the
# citiesTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = citiesTest['category'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
    # Single-argument print(...) gives the same text on Python 2 and 3
    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
print('Accuracy: %s' % (float(correct) / float(numtest)))
# Comment out print, play with other values for neighbors, try 'temperature'
# as feature

Predicted: warm  Actual: cool
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cold
Predicted: cold  Actual: cold
Predicted: warm  Actual: warm
Predicted: cool  Actual: cold
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: hot  Actual: hot
Predicted: cold  Actual: cold
Predicted: cool  Actual: cold
Predicted: cool  Actual: cold
Predicted: hot  Actual: hot
Predicted: cool  Actual: cool
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Accuracy: 0.78125


In [11]:
# 78% of the test-set size
0.78 * numtest

24.96

In [12]:
# 22% of the test-set size
0.22 * numtest

7.04

### <font color="green">Your Turn: K-nearest-neighbors on World Cup Data</font>

In [None]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# This cell does all the set-up, including reordering the data to avoid team bias.
# The path goes straight to pandas, which opens and closes the file itself;
# the previous open(..., 'rU') handle was never closed, and the 'U' mode flag
# is rejected by Python 3.11+.
players = pd.read_csv('Players.csv')
# Order by surname, then renumber the rows 0..n-1 in that new order.
players = players.sort_values(by='surname').reset_index(drop=True)
numitems = len(players)
percenttrain = 0.95
numtrain = int(numitems * percenttrain)
numtest = numitems - numtrain
# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('Training set %d items' % numtrain)
print('Test set %d items' % numtest)
# Explicitly positional slices: first 95% for training, the rest for testing.
playersTrain = players.iloc[:numtrain]
playersTest = players.iloc[numtrain:]

In [None]:
# This cell does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
neighbors = 10
classifier = KNeighborsClassifier(neighbors)
classifier.fit(playersTrain[features], playersTrain['position'])
predictions = classifier.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
# Pair each prediction with the true label by position. This replaces the
# playersTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = playersTest['position'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
#    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
# Single-argument print(...) gives the same text on Python 2 and 3
print('Accuracy: %s' % (float(correct) / float(numtest)))

### Decision tree classification

In [None]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
featurevals = citiesTrain[features]
labels = citiesTrain['category']
dt = DecisionTreeClassifier(min_samples_split=10) # parameter is optional
dt.fit(featurevals, labels)
predictions = dt.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
# Pair each prediction with the true label by position. This replaces the
# citiesTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = citiesTest['category'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
#    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
# Single-argument print(...) gives the same text on Python 2 and 3
print('Accuracy: %s' % (float(correct) / float(numtest)))
# Comment out print, play with other values for min_samples_split, try 'temperature'
# as feature

### "Forest" of decision trees

In [None]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
featurevals = citiesTrain[features]
labels = citiesTrain['category']
rf = RandomForestClassifier(n_estimators=10) # number of different decision trees
rf.fit(featurevals, labels)
predictions = rf.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
# Pair each prediction with the true label by position. This replaces the
# citiesTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = citiesTest['category'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
#    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
# Single-argument print(...) gives the same text on Python 2 and 3
print('Accuracy: %s' % (float(correct) / float(numtest)))
# Comment out print, play with other values for n_estimators

### <font color="green">Your Turn: Decision tree and forest of trees on World Cup Data</font>

In [None]:
# SINGLE TREE
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features and different values for min_samples_split.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
featurevals = playersTrain[features]
labels = playersTrain['position']
dt = DecisionTreeClassifier(min_samples_split=10) # parameter is optional
dt.fit(featurevals, labels)
predictions = dt.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
# Pair each prediction with the true label by position. This replaces the
# playersTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = playersTest['position'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
#    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
# Single-argument print(...) gives the same text on Python 2 and 3
print('Accuracy: %s' % (float(correct) / float(numtest)))

In [None]:
# FOREST OF TREES
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different values for n_estimators.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
featurevals = playersTrain[features]
labels = playersTrain['position']
rf = RandomForestClassifier(n_estimators=10) # number of different decision trees
rf.fit(featurevals, labels)
predictions = rf.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
# Pair each prediction with the true label by position. This replaces the
# playersTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = playersTest['position'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
#    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
# Single-argument print(...) gives the same text on Python 2 and 3
print('Accuracy: %s' % (float(correct) / float(numtest)))

### Naive Bayes classification

In [None]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
featurevals = citiesTrain[features]
labels = citiesTrain['category']
nb = GaussianNB()
nb.fit(featurevals, labels)
predictions = nb.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
# Pair each prediction with the true label by position. This replaces the
# citiesTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = citiesTest['category'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
#    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
# Single-argument print(...) gives the same text on Python 2 and 3
print('Accuracy: %s' % (float(correct) / float(numtest)))
# Comment out print, try removing 'longitude'

### <font color="green">Your Turn: Naive Bayes on World Cup Data</font>

In [None]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features. What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
featurevals = playersTrain[features]
labels = playersTrain['position']
nb = GaussianNB()
nb.fit(featurevals, labels)
predictions = nb.predict(playersTest[features])
# Calculate accuracy
numtrain = len(playersTrain)
numtest = len(playersTest)
# Pair each prediction with the true label by position. This replaces the
# playersTest.ix[numtrain+i] lookup: .ix was removed in pandas 1.0, and the
# lookup only worked while the index labels happened to equal row positions.
actuals = playersTest['position'].tolist()
correct = 0
for predicted, actual in zip(predictions, actuals):
#    print('Predicted: %s  Actual: %s' % (predicted, actual))
    if predicted == actual:
        correct += 1
# Single-argument print(...) gives the same text on Python 2 and 3
print('Accuracy: %s' % (float(correct) / float(numtest)))