In [1]:
%matplotlib inline

import pandas as pd
import seaborn as sbn
import numpy as np
sbn.set()
import matplotlib as plt
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 


In [2]:
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

In [3]:
# Read in the CSV
BI = pd.DataFrame.from_csv('../../Data/body_image.csv', index_col=None, encoding = "ISO-8859-1")

In [4]:
BI.head(10)

Unnamed: 0,Gender,Height,GPA,HS_GPA,Seat,WtFeel,Cheat
0,Female,64,2.6,2.63,M,AboutRt,No
1,Male,69,2.7,3.72,M,AboutRt,No
2,Female,66,3.0,3.44,F,AboutRt,No
3,Female,63,3.11,2.73,F,AboutRt,No
4,Male,72,3.4,2.35,B,OverWt,No
5,Female,67,3.43,3.84,M,AboutRt,No
6,Male,69,3.7,4.0,F,AboutRt,No
7,Male,74,3.7,3.92,B,AboutRt,No
8,Male,72,3.77,3.09,M,UnderWt,No
9,Female,63,3.5,4.0,F,AboutRt,No


In [5]:
seats = []

for idx, row in BI.iterrows():
    if row["Seat"] == "B":
        seats.append(0)
    elif row["Seat"] == "M":
        seats.append(.5)
    else:
        seats.append(1)
        
BI["SeatNum"] = seats

In [6]:
wtFeels = []

for idx, row in BI.iterrows():
    if row["WtFeel"] == "AboutRt":
        wtFeels.append(0)
    elif row["WtFeel"] == "OverWt":
        wtFeels.append(1)
    else:
        wtFeels.append(-1)
        
BI["WtFeelNum"] = wtFeels

In [7]:
genders = []

for idx, row in BI.iterrows():
    if row["Gender"] == "Male":
        genders.append(1)
    else:
        genders.append(0)
        
BI["GenderNum"] = genders

In [8]:
cheats = []

for idx, row in BI.iterrows():
    if row["Cheat"] == "Yes":
        cheats.append(1)
    else:
        cheats.append(0)
        
BI["CheatNum"] = cheats

In [9]:
BI.head(20)

Unnamed: 0,Gender,Height,GPA,HS_GPA,Seat,WtFeel,Cheat,SeatNum,WtFeelNum,GenderNum,CheatNum
0,Female,64.0,2.6,2.63,M,AboutRt,No,0.5,0,0,0
1,Male,69.0,2.7,3.72,M,AboutRt,No,0.5,0,1,0
2,Female,66.0,3.0,3.44,F,AboutRt,No,1.0,0,0,0
3,Female,63.0,3.11,2.73,F,AboutRt,No,1.0,0,0,0
4,Male,72.0,3.4,2.35,B,OverWt,No,0.0,1,1,0
5,Female,67.0,3.43,3.84,M,AboutRt,No,0.5,0,0,0
6,Male,69.0,3.7,4.0,F,AboutRt,No,1.0,0,1,0
7,Male,74.0,3.7,3.92,B,AboutRt,No,0.0,0,1,0
8,Male,72.0,3.77,3.09,M,UnderWt,No,0.5,-1,1,0
9,Female,63.0,3.5,4.0,F,AboutRt,No,1.0,0,0,0


# Predicting if a peron will cheat
## Based on:
* Gender
* Height
* College GPA
* HS GPA
* Where they sit in class
* How they feel about their weight

# SVC

In [10]:
data = BI.as_matrix(columns=["GenderNum","Height","GPA","HS_GPA","SeatNum","WtFeelNum"])
target = BI.as_matrix(columns=["CheatNum"])


In [11]:
Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

In [12]:
cls = SVC()
cls.fit(Xtrain, ytrain.ravel())

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [13]:
result = cls.predict([[1,71,3.8,3.67,1,1]])
print(result)

[0]


In [14]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.93      1.00      0.96        64
          1       0.00      0.00      0.00         5

avg / total       0.86      0.93      0.89        69



  'precision', 'predicted', average, warn_for)


# Perceptron

In [15]:
data = BI.as_matrix(columns=["GenderNum","Height","GPA","HS_GPA","SeatNum","WtFeelNum"])
target = BI.as_matrix(columns=["CheatNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = Perceptron()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[1,71,3.8,3.67,1,1]])
print(result)

[0]


In [16]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.93      1.00      0.96        64
          1       0.00      0.00      0.00         5

avg / total       0.86      0.93      0.89        69



  'precision', 'predicted', average, warn_for)


# Linear Regression

In [17]:
data = BI.as_matrix(columns=["GenderNum","Height","GPA","HS_GPA","SeatNum","WtFeelNum"])
target = BI.as_matrix(columns=["CheatNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = LinearRegression()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[1,71,3.8,3.67,1,1]])
print(result)

[ 0.16851186]


In [18]:
print (cls.score(Xtest,ytest))

-0.0652578058839


In [19]:
print(cls.coef_,cls.intercept_)

[-0.09949252  0.00965749  0.07577529 -0.05508191  0.15681352  0.02044626] -0.680733021795


# Decision Tree

In [20]:
data = BI.as_matrix(columns=["GenderNum","Height","GPA","HS_GPA","SeatNum","WtFeelNum"])
target = BI.as_matrix(columns=["CheatNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = DecisionTreeClassifier()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[1,71,3.8,3.67,1,1]])
print(result)

[1]


In [21]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.95      0.88      0.91        65
          1       0.11      0.25      0.15         4

avg / total       0.90      0.84      0.87        69



# Random Forest

In [22]:
data = BI.as_matrix(columns=["GenderNum","Height","GPA","HS_GPA","SeatNum","WtFeelNum"])
target = BI.as_matrix(columns=["CheatNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = RandomForestClassifier()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[1,71,3.8,3.67,1,1]])
print(result)

[0]


In [23]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.94      1.00      0.97        65
          1       0.00      0.00      0.00         4

avg / total       0.89      0.94      0.91        69



  'precision', 'predicted', average, warn_for)


# Predicting male or female
## Based on:
* College GPA
* HS GPA
* Where they sit in class

# SVC

In [24]:
data = BI.as_matrix(columns=["GPA","HS_GPA","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = SVC()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 3.6, 1]])
print(result)

[0]


In [25]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.73      0.94      0.82        50
          1       0.40      0.11      0.17        19

avg / total       0.64      0.71      0.64        69



# Perceptron

In [26]:
data = BI.as_matrix(columns=["GPA","HS_GPA","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = Perceptron()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 3.6, 1]])
print(result)

[1]


In [27]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        46
          1       0.33      1.00      0.50        23

avg / total       0.11      0.33      0.17        69



  'precision', 'predicted', average, warn_for)


# Linear Regression

In [28]:
data = BI.as_matrix(columns=["GPA","HS_GPA","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = LinearRegression()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 3.6, 1]])
print(result)

[ 0.22764269]


In [29]:
print (cls.score(Xtest,ytest))
print(cls.coef_,cls.intercept_)

0.0412544524676
[-0.04737569 -0.01949515 -0.19589184] 0.673744660627


# Decison Tree

In [30]:
data = BI.as_matrix(columns=["GPA","HS_GPA","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = DecisionTreeClassifier()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 3.6, 1]])
print(result)

[0]


In [31]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.69      0.72      0.70        46
          1       0.38      0.35      0.36        23

avg / total       0.59      0.59      0.59        69



# Random Forest

In [32]:
data = BI.as_matrix(columns=["GPA","HS_GPA","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = RandomForestClassifier()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 3.6, 1]])
print(result)

[0]


In [33]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.60      0.73      0.66        44
          1       0.25      0.16      0.20        25

avg / total       0.48      0.52      0.49        69



# Predicting male or female
## Based on:
* College GPA
* How they feel about their weight
* Where they sit in class

# SVC

In [34]:
data = BI.as_matrix(columns=["GPA","WtFeelNum","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = SVC()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 0, 1]])
print(result)

[0]


In [35]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.67      0.95      0.79        44
          1       0.67      0.16      0.26        25

avg / total       0.67      0.67      0.59        69



# Perceptron

In [36]:
data = BI.as_matrix(columns=["GPA","WtFeelNum","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = Perceptron()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 0, 1]])
print(result)

[0]


In [37]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.70      0.38      0.49        42
          1       0.43      0.74      0.55        27

avg / total       0.59      0.52      0.51        69



# Linear Regression

In [38]:
data = BI.as_matrix(columns=["GPA","WtFeelNum","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = LinearRegression()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 0, 1]])
print(result)

[ 0.29773688]


In [39]:
print (cls.score(Xtest,ytest))
print(cls.coef_,cls.intercept_)

0.0640663288101
[-0.09579904 -0.09570782 -0.09797049] 0.759743725535


# Decision Tree

In [40]:
data = BI.as_matrix(columns=["GPA","WtFeelNum","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = DecisionTreeClassifier()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 0, 1]])
print(result)

[0]


In [41]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.61      0.83      0.70        41
          1       0.46      0.21      0.29        28

avg / total       0.55      0.58      0.54        69



# Random Forest

In [42]:
data = BI.as_matrix(columns=["GPA","WtFeelNum","SeatNum"])
target = BI.as_matrix(columns=["GenderNum"])

Xtrain, Xtest, ytrain, ytest = train_test_split(data, target, test_size = 0.3)

cls = RandomForestClassifier()
cls.fit(Xtrain, ytrain.ravel())

result = cls.predict([[3.8, 0, 1]])
print(result)

[1]


In [43]:
ypred = cls.predict(Xtest)
print(classification_report(ytest, ypred))

             precision    recall  f1-score   support

          0       0.67      0.67      0.67        43
          1       0.46      0.46      0.46        26

avg / total       0.59      0.59      0.59        69



In [44]:
BI.head(20)

Unnamed: 0,Gender,Height,GPA,HS_GPA,Seat,WtFeel,Cheat,SeatNum,WtFeelNum,GenderNum,CheatNum
0,Female,64.0,2.6,2.63,M,AboutRt,No,0.5,0,0,0
1,Male,69.0,2.7,3.72,M,AboutRt,No,0.5,0,1,0
2,Female,66.0,3.0,3.44,F,AboutRt,No,1.0,0,0,0
3,Female,63.0,3.11,2.73,F,AboutRt,No,1.0,0,0,0
4,Male,72.0,3.4,2.35,B,OverWt,No,0.0,1,1,0
5,Female,67.0,3.43,3.84,M,AboutRt,No,0.5,0,0,0
6,Male,69.0,3.7,4.0,F,AboutRt,No,1.0,0,1,0
7,Male,74.0,3.7,3.92,B,AboutRt,No,0.0,0,1,0
8,Male,72.0,3.77,3.09,M,UnderWt,No,0.5,-1,1,0
9,Female,63.0,3.5,4.0,F,AboutRt,No,1.0,0,0,0
