### Regression using numpy (with pyplot)

In [None]:
import csv
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read Cities.csv into a list of dictionaries: one dict per data row,
# keyed by the column names taken from the file's first line.
# newline='' leaves line-ending handling to the csv module, which is what
# the csv documentation recommends for any file handed to a reader.
with open('Cities.csv', newline='') as f:
    cities = list(csv.DictReader(f))

In [None]:
# Introduction to plotting a line:
# draw the green segment joining the points (1, 2) and (2, 4)
xs = [1, 2]
ys = [2, 4]
plt.plot(xs, ys, color='green')
plt.show()

In [None]:
# Create latitude versus temperature scatterplot
# Values read through csv are strings, so convert each one to float first
lats = [float(city['latitude']) for city in cities]
temps = [float(city['temperature']) for city in cities]
plt.scatter(lats, temps)
plt.xlabel('latitude')
plt.ylabel('temperature')
plt.show()

In [None]:
# Add linear regression
# A degree-1 polyfit gives slope a and intercept b: the line is y = ax + b
a, b = np.polyfit(lats, temps, 1)
# The line only needs its two endpoints, at the smallest and largest latitude
x1, x2 = min(lats), max(lats)
line_xs = [x1, x2]
line_ys = [a*x1 + b, a*x2 + b]
plt.scatter(lats, temps)
plt.plot(line_xs, line_ys, color='red')
plt.show()
# Beautify using plt.xlim(x1,x2)

In [None]:
# Correlation coefficients (r values)
# np.corrcoef returns a correlation matrix; entry [1,0] is the r value
# between the two sequences passed in
lat_matrix = np.corrcoef(lats, temps)
cc = lat_matrix[1, 0]
print('Correlation coefficient for latitude versus temperature:', cc)
# Repeat with longitude in place of latitude
longs = [float(city['longitude']) for city in cities]
long_matrix = np.corrcoef(longs, temps)
cc = long_matrix[1, 0]
print('Correlation coefficient for longitude versus temperature:', cc)

In [None]:
# Use linear regression for temperature predictor
# Training data: compute latitude-temperature regression from cities
# in Norway, France, and Turkey
training_countries = ('Norway', 'France', 'Turkey')
training_cities = [city for city in cities if city['country'] in training_countries]
lats = [float(city['latitude']) for city in training_cities]
temps = [float(city['temperature']) for city in training_cities]

# Compute and show regression (line is y = ax + b, drawn between the
# smallest and largest training latitude)
a, b = np.polyfit(lats, temps, 1)
x1, x2 = min(lats), max(lats)
plt.scatter(lats, temps)
plt.plot([x1, x2], [a*x1 + b, a*x2 + b], color='red')
plt.xlim(x1, x2)
plt.show()

# Loop asking user for city name, compute predicted + actual temperature
# The lookup searches ALL cities, not only the training ones, and uses the
# first row whose 'city' field equals the name exactly as typed
while True:
    name = input('Enter city name (or "quit" to quit): ')
    if name == 'quit':
        break
    match = next((city for city in cities if city['city'] == name), None)
    if match is None:
        print('City not in dataset')
        continue
    print('Predicted temperature:', a * float(match['latitude']) + b)
    # Actual value is printed as the raw string read from the file
    print('Actual temperature:', match['temperature'])

### <font color="green">Your Turn: World Cup Data</font>

In [None]:
# Read Players.csv into a list of dictionaries: one dict per data row,
# keyed by the column names taken from the file's first line.
# newline='' leaves line-ending handling to the csv module, which is what
# the csv documentation recommends for any file handed to a reader.
with open('Players.csv', newline='') as f:
    players = list(csv.DictReader(f))

In [None]:
# From the players data, compute and plot a linear regression for
# minutes played versus passes made.
# Reminder: copy-paste-modify approach to programming!

In [None]:
# Show the correlation coefficient for the regression.
# Extra credit: Also show correlation coefficients for minutes played
# versus tackles, minutes played versus shots, and minutes played versus saves

In [None]:
# Use linear regression for number-of-passes predictor.
# Training data: compute minutes-passes regression for players from
# Greece, USA, and Portugal

In [None]:
# SUPER BONUS!!
# Repeat the previous exercise, but use a separate predictor for each of the
# four positions (goalkeeper, defender, midfielder, forward). Does it do better?
# Try comparing the correlation coefficients against those of one regression for all players.