# Assignment 5 - dataset

### Prepare the data

In [22]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Load the student dataset from 'student-mat.csv'. The variable we want to
# predict is `romantic` (has a romantic interest).
df = pd.read_csv(filepath_or_buffer='student-mat.csv')
df.head(n=5)  # preview the first five rows

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [23]:
# Turn the qualitative variable 'romantic' into dummy (indicator) columns,
# one column per category.
dummies = pd.get_dummies(data=df['romantic'])
dummies.head(n=1)  # peek at the first row of the dummy columns

Unnamed: 0,no,yes
0,1,0


In [24]:
# Add the dummy columns to df. axis=1 concatenates along the columns
# (axis=0 would append rows instead).
# Dummy columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not create duplicate 'no'/'yes' columns
# (duplicates would make df['yes'] return a DataFrame instead of a Series).
df = pd.concat(
    [df.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,
)
df.head(1)  # check that the dummies were added

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,goout,Dalc,Walc,health,absences,G1,G2,G3,no,yes
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,1,1,3,6,5,6,6,1,0


### Create a subset

In [25]:
# Keep only the 7 variables selected for the analysis.
df_subset = df[
    [
        "Walc",
        "goout",
        "famrel",
        "age",
        "health",
        "absences",
        "freetime",
    ]
]
df_subset  # display the subset

Unnamed: 0,Walc,goout,famrel,age,health,absences,freetime
0,1,4,4,18,3,6,3
1,1,3,5,17,3,4,3
2,3,2,4,15,3,10,3
3,1,2,3,15,5,2,2
4,2,2,4,16,5,4,3
...,...,...,...,...,...,...,...
390,5,4,5,20,4,11,5
391,4,5,2,17,2,3,4
392,3,3,5,21,3,3,5
393,4,1,4,18,5,0,4


### Training

In [26]:
# y is the target: the 'yes' dummy of `romantic` (has a romantic interest).
y = df['yes']
# X holds the predictor variables, with each row labelled by its school.
# set_index returns a new DataFrame, so df_subset itself is left unchanged
# (assigning to X.index would have modified df_subset as well, because X
# would just be another name for the same object).
X = df_subset.set_index(df['school'])
X.head()  # preview X

Unnamed: 0_level_0,Walc,goout,famrel,age,health,absences,freetime
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GP,1,4,4,18,3,6,3
GP,1,3,5,17,3,4,3
GP,3,2,4,15,3,10,3
GP,1,2,3,15,5,2,2
GP,2,2,4,16,5,4,3


In [27]:
# Split features and target into train and test parts: 30% of the rows are
# held out for testing, and random_state=1 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
X_train.head()  # first rows of the training features

Unnamed: 0_level_0,Walc,goout,famrel,age,health,absences,freetime
school,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GP,3,3,4,17,4,0,4
GP,1,3,4,16,5,2,2
GP,2,3,3,15,1,2,2
GP,3,3,4,15,5,2,3
GP,1,3,5,20,5,0,5


In [28]:
# Create a linear regression model and train it on the training data.
# fit() returns the estimator itself, so `model` and `lm` refer to the same
# fitted object.
lm = LinearRegression()
lm.fit(X_train, y_train)
model = lm

In [29]:
# Predict the target for the test data, then report R^2 of those
# predictions against the true test values.
y_test_p = lm.predict(X_test)
r2_score(y_true=y_test, y_pred=y_test_p)

0.035707405761282396

### KNN Algorithm

How it works: the algorithm bases its prediction on the three nearest neighbours of a case and assigns the class that the majority of those 3 neighbours belong to.

In [30]:
# Build a k-nearest-neighbours classifier that looks at 3 neighbours and fit
# it on the training data (fit() returns the classifier itself, so no
# re-assignment is needed).
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)  # score of the fitted model on the test data

0.6302521008403361

In [31]:
# Class predictions for the test data, cross-tabulated against the true
# labels in a confusion matrix.
y_test_pred = knn.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
cm

array([[63, 18],
       [26, 12]])

In [32]:
# Wrap the confusion matrix in a DataFrame with readable labels:
# the index names the rows, the `_p` columns name the predicted classes.
conf_matrix = pd.DataFrame(
    data=cm,
    index=['no_romanticinterest', 'romanticinterest'],
    columns=['no_romanticinterest_p', 'romanticinterest_p'],
)
conf_matrix  # display the labelled confusion matrix

Unnamed: 0,no_romanticinterest_p,romanticinterest_p
no_romanticinterest,63,18
romanticinterest,26,12


63 cases are correctly predicted as having no romantic interest, while 18 cases without a romantic interest are wrongly predicted as having one (these are counts, not percentages).

In [33]:
# Recall for the "no romantic interest" class: cases correctly predicted as
# "no" divided by all cases that actually are "no" (first row of cm).
# Taken from cm instead of typed-in counts so it stays in sync with the model.
cm[0, 0] / cm[0, :].sum()

0.7777777777777778

In [34]:
# Precision for the "no romantic interest" class: cases correctly predicted
# as "no" divided by all cases predicted as "no" (first column of cm).
# Taken from cm instead of typed-in counts so it stays in sync with the model.
cm[0, 0] / cm[:, 0].sum()

0.7078651685393258