In [179]:
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [180]:
data = pd.read_csv('/kaggle/input/gender-classification/Transformed Data Set - Sheet1.csv')

In [181]:
data.head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,Cool,Rock,Vodka,7UP/Sprite,F
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi,F
2,Warm,Rock,Wine,Coca Cola/Pepsi,F
3,Warm,Folk/Traditional,Whiskey,Fanta,F
4,Cool,Rock,Vodka,Coca Cola/Pepsi,F


# PreProcessing

In [182]:
encoder = LabelEncoder()
data['Favorite Color'] = encoder.fit_transform(data['Favorite Color'])

In [183]:
data['Gender'] = encoder.fit_transform(data['Gender'])
mapping_gender = {index : label  for index , label in enumerate(encoder.classes_)}

In [184]:
mapping_gender

{0: 'F', 1: 'M'}

In [185]:
data['Favorite Music Genre'].unique()

array(['Rock', 'Hip hop', 'Folk/Traditional', 'Jazz/Blues', 'Pop',
       'Electronic', 'R&B and soul'], dtype=object)

In [186]:
data['Favorite Beverage'].unique()

array(['Vodka', 'Wine', 'Whiskey', "Doesn't drink", 'Beer', 'Other'],
      dtype=object)

In [187]:
data['Favorite Soft Drink'].unique()

array(['7UP/Sprite', 'Coca Cola/Pepsi', 'Fanta', 'Other'], dtype=object)

In [188]:
def prefixes(df,columns,prefix):
    for col in columns:
        df[col] = df[col].apply(lambda x : prefix +  x )
    return df

In [189]:
prefixes(data,['Favorite Soft Drink'] , 's_')
prefixes(data,['Favorite Beverage'] , 's_')

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink,Gender
0,0,Rock,s_Vodka,s_7UP/Sprite,0
1,1,Hip hop,s_Vodka,s_Coca Cola/Pepsi,0
2,2,Rock,s_Wine,s_Coca Cola/Pepsi,0
3,2,Folk/Traditional,s_Whiskey,s_Fanta,0
4,0,Rock,s_Vodka,s_Coca Cola/Pepsi,0
...,...,...,...,...,...
61,0,Rock,s_Vodka,s_Coca Cola/Pepsi,1
62,0,Hip hop,s_Beer,s_Coca Cola/Pepsi,1
63,1,Hip hop,s_Doesn't drink,s_Fanta,1
64,0,Rock,s_Wine,s_Coca Cola/Pepsi,1


In [190]:
def dummieget(data,columns,inplace):
    for col in columns:
        dummie = (pd.get_dummies(data[col]))
        dummie = dummie.astype(np.int)
        data = pd.concat([data,dummie],axis=1)
        data.drop(col,axis=1,inplace=inplace)
    return data

In [191]:
data = dummieget(data,['Favorite Soft Drink','Favorite Beverage','Favorite Music Genre'],True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dummie = dummie.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dummie = dummie.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dummie = dummie.astype(np.int)


In [192]:
data

Unnamed: 0,Favorite Color,Gender,s_7UP/Sprite,s_Coca Cola/Pepsi,s_Fanta,s_Other,s_Beer,s_Doesn't drink,s_Other.1,s_Vodka,s_Whiskey,s_Wine,Electronic,Folk/Traditional,Hip hop,Jazz/Blues,Pop,R&B and soul,Rock
0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,2,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,2,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
62,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
63,1,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
64,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [193]:
y = data['Gender']
x = data.drop('Gender',axis=1)

In [194]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Favorite Color     66 non-null     int64
 1   s_7UP/Sprite       66 non-null     int64
 2   s_Coca Cola/Pepsi  66 non-null     int64
 3   s_Fanta            66 non-null     int64
 4   s_Other            66 non-null     int64
 5   s_Beer             66 non-null     int64
 6   s_Doesn't drink    66 non-null     int64
 7   s_Other            66 non-null     int64
 8   s_Vodka            66 non-null     int64
 9   s_Whiskey          66 non-null     int64
 10  s_Wine             66 non-null     int64
 11  Electronic         66 non-null     int64
 12  Folk/Traditional   66 non-null     int64
 13  Hip hop            66 non-null     int64
 14  Jazz/Blues         66 non-null     int64
 15  Pop                66 non-null     int64
 16  R&B and soul       66 non-null     int64
 17  Rock              

In [195]:
scaler = MinMaxScaler()

x = pd.DataFrame(scaler.fit_transform(x) , columns = x.columns )

In [196]:
x

Unnamed: 0,Favorite Color,s_7UP/Sprite,s_Coca Cola/Pepsi,s_Fanta,s_Other,s_Beer,s_Doesn't drink,s_Other.1,s_Vodka,s_Whiskey,s_Wine,Electronic,Folk/Traditional,Hip hop,Jazz/Blues,Pop,R&B and soul,Rock
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
62,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
63,0.5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
64,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Training

In [197]:
x_train , x_test , y_train , y_test = train_test_split(x,y,train_size=0.7)

In [198]:
model = LogisticRegression()
model.fit(x_train,y_train)
model.score(x_test,y_test)

0.6

In [199]:
model1 = SVC()
model1.fit(x_train,y_train)
model1.score(x_test,y_test)

0.55