In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the survey table from the CSV export sitting next to the notebook.
csv_path = "Transformed Data Set - Sheet1.csv"
data = pd.read_csv(csv_path)

# Show the first ten rows as a quick sanity check of the load.
data.head(10)

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F
5,Warm,Jazz/Blues,Doesn't drink,Fanta,F
6,Cool,Pop,Beer,Coca Cola/Pepsi,F
7,Warm,Pop,Whiskey,Fanta,F
8,Warm,Rock,Other,7UP/Sprite,F
9,Neutral,Pop,Wine,Coca Cola/Pepsi,F


In [4]:
# Dimensions of the loaded table as (rows, columns).
data.shape

(66, 5)

In [5]:
# Per-column summary: dtype and non-null count, plus memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Favorite Color        66 non-null     object
 1   Favorite Music Genre  66 non-null     object
 2   Favorite Beverage     66 non-null     object
 3   Favorite Soft Drink   66 non-null     object
 4   Gender                66 non-null     object
dtypes: object(5)
memory usage: 2.7+ KB


In [6]:
# Print the frequency of every distinct value, one column at a time,
# leaving a blank line between the per-column tables.
for column_name in data.columns:
    print(data[column_name].value_counts(), end="\n\n")

Cool       37
Warm       22
Neutral     7
Name: Favorite Color, dtype: int64

Rock                19
Pop                 17
Hip hop              8
Electronic           8
R&B and soul         6
Jazz/Blues           4
Folk/Traditional     4
Name: Favorite Music Genre, dtype: int64

Doesn't drink    14
Beer             13
Other            11
Wine             10
Vodka             9
Whiskey           9
Name: Favorite Beverage, dtype: int64

Coca Cola/Pepsi    32
Fanta              14
7UP/Sprite         13
Other               7
Name: Favorite Soft Drink, dtype: int64

F    33
M    33
Name: Gender, dtype: int64



In [7]:
# Count of missing values in each column.
data.isna().sum()

Favorite Color          0
Favorite Music Genre    0
Favorite Beverage       0
Favorite Soft Drink     0
Gender                  0
dtype: int64

In [8]:
# Encode every categorical column (features and the Gender target) as integer codes.
#
# A separate encoder is fitted and kept for each column. Refitting one shared
# encoder would overwrite its learned classes on every iteration, leaving only
# the last column's mapping available, so the other codes could not be turned
# back into their labels.
#
# Recover labels later with: encoders[column].inverse_transform(codes)
#
# NOTE(review): LabelEncoder numbers categories in sorted label order, which gives
# nominal features (colour, genre, ...) an arbitrary ordering that distance- and
# linear-based models will treat as meaningful. Consider one-hot encoding the
# feature columns instead.
encoders = {}

for column in data.columns:
    le = LabelEncoder()  # fresh encoder so earlier columns keep their own mapping
    data[column] = le.fit_transform(data[column])
    encoders[column] = le

# `le` is left bound to the encoder of the last column, as before.
data.head(10)

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,0,6,3,0,0
1,1,2,3,1,0
2,2,6,5,1,0
3,2,1,4,2,0
4,0,6,3,1,0
5,2,3,1,2,0
6,0,4,0,1,0
7,2,4,4,2,0
8,2,6,2,0,0
9,1,4,5,1,0


In [9]:
# Separate the predictors from the label the models should learn.
target_column = "Gender"

y = data[target_column]
x = data.drop(columns=[target_column])

# Preview the feature matrix (every column except the target).
x.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink
0,0,6,3,0
1,1,2,3,1
2,2,6,5,1
3,2,1,4,2
4,0,6,3,1


In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# (rows, columns) of the training feature matrix produced by the split.
X_train.shape

(52, 4)

In [12]:
# (rows, columns) of the held-out test feature matrix.
X_test.shape

(14, 4)

In [13]:
# First ten training rows; the index keeps the original row labels from `data`.
X_train.head(10)

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink
34,1,2,0,0
25,2,5,4,1
3,2,1,4,2
17,0,6,2,1
40,0,6,3,1
8,2,6,2,0
6,0,4,0,1
28,0,4,1,1
4,0,6,3,1
48,0,6,1,3


In [14]:
# Number of training samples per encoded Gender value (class balance check).
y_train.value_counts()

1    26
0    26
Name: Gender, dtype: int64

In [15]:
# Number of test samples per encoded Gender value (class balance check).
y_test.value_counts()

1    7
0    7
Name: Gender, dtype: int64

In [56]:
def classification_scores(model):
    """Summarise a fitted model's score on the train and test splits.

    The splits are not passed in: the function reads the notebook-level
    ``X_train``/``y_train`` and ``X_test``/``y_test`` variables, so those must
    exist before it is called.

    Parameters
    ----------
    model
        Any fitted object exposing ``score(X, y)``; for the scikit-learn
        classifiers used in this notebook that is the mean accuracy.

    Returns
    -------
    tuple of str
        ``("Train accuracy: <pct>", "Test accuracy: <pct>")`` where ``<pct>``
        is the score multiplied by 100, unrounded.
    """
    labelled_splits = (
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    )
    return tuple(
        f"{label} accuracy: {model.score(features, target) * 100}"
        for label, features, target in labelled_splits
    )

## __Model__

In [57]:
# KNN: k-nearest-neighbours classifier with scikit-learn's default settings

from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training chain into one step.
knn = KNeighborsClassifier().fit(X_train, y_train)

print(classification_scores(knn))

('Train accuracy: 75.0', 'Test accuracy: 57.14285714285714')


In [60]:
# SVM: support-vector classifier with scikit-learn's default settings

from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training chain into one step.
svm = SVC().fit(X_train, y_train)

print(classification_scores(svm))

('Train accuracy: 73.07692307692307', 'Test accuracy: 57.14285714285714')


In [59]:
# Logistic Regression: linear classifier with scikit-learn's default settings

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain into one step.
lr = LogisticRegression().fit(X_train, y_train)

print(classification_scores(lr))

('Train accuracy: 55.769230769230774', 'Test accuracy: 64.28571428571429')
