In [51]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

def delete_whitespace(string: str) -> str:
    # Replace whitespace by underscore
    return string.replace(" ", "_")


def label_or_Dummies(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    # Case we want to predict just one observation
    if df.shape[0] < 5:
        return pd.get_dummies(data=df, columns=cols)
    else:
        le = LabelEncoder()
        cols_to_dummies = []
        for col in cols:
            if df[col].nunique() > 2:
                cols_to_dummies.append(col)
            else:
                df[col] = le.fit_transform(df[col])
    return pd.get_dummies(data=df, columns=cols_to_dummies)


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    new_cols = []
    # Rename columns
    for i in range(len(df.columns)):
        new_cols.append(delete_whitespace(df.columns[i]))

    df.columns = new_cols

    # Dummies variables and labelencoder
    cols = df.columns
    df = label_or_Dummies(df, cols)

    return df


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 1000)

In [74]:
df = pd.read_csv("Data/Transformed Data Set - Sheet1.csv")

In [75]:
df.drop(['Gender'],axis=1).head()

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink
0,Cool,Rock,Vodka,7UP/Sprite
1,Neutral,Hip hop,Vodka,Coca Cola/Pepsi
2,Warm,Rock,Wine,Coca Cola/Pepsi
3,Warm,Folk/Traditional,Whiskey,Fanta
4,Cool,Rock,Vodka,Coca Cola/Pepsi


In [76]:
df['Favorite Music Genre'].unique()

array(['Rock', 'Hip hop', 'Folk/Traditional', 'Jazz/Blues', 'Pop',
       'Electronic', 'R&B and soul'], dtype=object)

In [77]:
df['Favorite Beverage'].unique()

array(['Vodka', 'Wine', 'Whiskey', "Doesn't drink", 'Beer', 'Other'],
      dtype=object)

In [78]:
df['Favorite Soft Drink'].unique()

array(['7UP/Sprite', 'Coca Cola/Pepsi', 'Fanta', 'Other'], dtype=object)

In [54]:
df = preprocess(df)

In [55]:
df.head()

Unnamed: 0,Gender,Favorite_Color_Cool,Favorite_Color_Neutral,Favorite_Color_Warm,Favorite_Music_Genre_Electronic,Favorite_Music_Genre_Folk/Traditional,Favorite_Music_Genre_Hip hop,Favorite_Music_Genre_Jazz/Blues,Favorite_Music_Genre_Pop,Favorite_Music_Genre_R&B and soul,Favorite_Music_Genre_Rock,Favorite_Beverage_Beer,Favorite_Beverage_Doesn't drink,Favorite_Beverage_Other,Favorite_Beverage_Vodka,Favorite_Beverage_Whiskey,Favorite_Beverage_Wine,Favorite_Soft_Drink_7UP/Sprite,Favorite_Soft_Drink_Coca Cola/Pepsi,Favorite_Soft_Drink_Fanta,Favorite_Soft_Drink_Other
0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0
1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0
3,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0


In [56]:
X = df.drop(['Gender'],axis=1)
y = df.Gender

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,stratify=y,random_state=5)

In [57]:
from joblib import load

In [58]:
X_df = {"Favorite Color": "Cool", "Favorite Music Genre": "Rock", "Favorite Beverage": "Vodka", "Favorite Soft Drink": "7UP/Sprite"}

In [59]:
X_df = pd.DataFrame([X_df])

In [60]:
X_df

Unnamed: 0,Favorite Color,Favorite Music Genre,Favorite Beverage,Favorite Soft Drink
0,Cool,Rock,Vodka,7UP/Sprite


In [61]:
clf = load("Model/lr.joblib")
clf.fit(X,y)

LogisticRegression(C=100.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=98, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
c = preprocess(X_df)

In [63]:
c

Unnamed: 0,Favorite_Color_Cool,Favorite_Music_Genre_Rock,Favorite_Beverage_Vodka,Favorite_Soft_Drink_7UP/Sprite
0,1,1,1,1


In [64]:
X.head()

Unnamed: 0,Favorite_Color_Cool,Favorite_Color_Neutral,Favorite_Color_Warm,Favorite_Music_Genre_Electronic,Favorite_Music_Genre_Folk/Traditional,Favorite_Music_Genre_Hip hop,Favorite_Music_Genre_Jazz/Blues,Favorite_Music_Genre_Pop,Favorite_Music_Genre_R&B and soul,Favorite_Music_Genre_Rock,Favorite_Beverage_Beer,Favorite_Beverage_Doesn't drink,Favorite_Beverage_Other,Favorite_Beverage_Vodka,Favorite_Beverage_Whiskey,Favorite_Beverage_Wine,Favorite_Soft_Drink_7UP/Sprite,Favorite_Soft_Drink_Coca Cola/Pepsi,Favorite_Soft_Drink_Fanta,Favorite_Soft_Drink_Other
0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0
1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0
3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0


In [65]:
df_newdata = c.reindex(labels=X.columns,axis=1)

In [70]:
df_newdata= df_newdata.fillna(0)

In [72]:
clf.predict_proba(df_newdata)

array([[0.37984849, 0.62015151]])

In [81]:

import requests
url = 'http://127.0.0.1:8000/gender/predict_gender'
pred = requests.post(url,json={"Favorite Color": "Cool", "Favorite Music Genre": "Rock", "Favorite Beverage": "Vodka", "Favorite Soft Drink": "7UP/Sprite"})
print(pred.json())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)