# Getting Started

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('../input/food-preferences/Food_Preference.csv')

In [3]:
data

Unnamed: 0,Timestamp,Participant_ID,Gender,Nationality,Age,Food,Juice,Dessert
0,2019/05/07 2:59:13 PM GMT+8,FPS001,Male,Indian,24,Traditional food,Fresh Juice,Maybe
1,2019/05/07 2:59:45 PM GMT+8,FPS002,Female,Indian,22,Western Food,Carbonated drinks,Yes
2,2019/05/07 3:00:05 PM GMT+8,FPS003,Male,Indian,31,Western Food,Fresh Juice,Maybe
3,2019/05/07 3:00:11 PM GMT+8,FPS004,Female,Indian,25,Traditional food,Fresh Juice,Maybe
4,2019/05/07 3:02:50 PM GMT+8,FPS005,Male,Indian,27,Traditional food,Fresh Juice,Maybe
...,...,...,...,...,...,...,...,...
283,2019/05/10 9:24:00 AM GMT+8,FPS284,Male,Indian,27,Western Food,Fresh Juice,Yes
284,2019/05/10 9:32:54 AM GMT+8,FPS285,Male,Indian,24,Traditional food,Fresh Juice,Yes
285,2019/05/10 12:09:17 PM GMT+8,FPS286,Male,Indian,25,Traditional food,Fresh Juice,Yes
286,2019/05/10 12:52:17 PM GMT+8,FPS287,Male,Indian,27,Traditional food,Fresh Juice,Yes


In [4]:
data.drop(['Timestamp', 'Participant_ID'], axis=1, inplace=True)

In [5]:
data

Unnamed: 0,Gender,Nationality,Age,Food,Juice,Dessert
0,Male,Indian,24,Traditional food,Fresh Juice,Maybe
1,Female,Indian,22,Western Food,Carbonated drinks,Yes
2,Male,Indian,31,Western Food,Fresh Juice,Maybe
3,Female,Indian,25,Traditional food,Fresh Juice,Maybe
4,Male,Indian,27,Traditional food,Fresh Juice,Maybe
...,...,...,...,...,...,...
283,Male,Indian,27,Western Food,Fresh Juice,Yes
284,Male,Indian,24,Traditional food,Fresh Juice,Yes
285,Male,Indian,25,Traditional food,Fresh Juice,Yes
286,Male,Indian,27,Traditional food,Fresh Juice,Yes


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Gender       284 non-null    object
 1   Nationality  288 non-null    object
 2   Age          288 non-null    int64 
 3   Food         288 non-null    object
 4   Juice        288 non-null    object
 5   Dessert      288 non-null    object
dtypes: int64(1), object(5)
memory usage: 13.6+ KB


# Preprocessing

## Missing Values

In [7]:
data.dropna(axis=0, inplace=True)
data.reset_index(drop=True, inplace=True)

In [8]:
data['Age']

0      24
1      22
2      31
3      25
4      27
       ..
279    27
280    24
281    25
282    27
283    27
Name: Age, Length: 284, dtype: int64

In [9]:
age_bins = pd.qcut(data['Age'], q=2, labels=[0, 1])

In [10]:
pd.concat([data['Age'], age_bins], axis=1)

Unnamed: 0,Age,Age.1
0,24,0
1,22,0
2,31,1
3,25,0
4,27,0
...,...,...
279,27,0
280,24,0
281,25,0
282,27,0


In [11]:
data['Age'] = age_bins

## Encoding

In [12]:
data

Unnamed: 0,Gender,Nationality,Age,Food,Juice,Dessert
0,Male,Indian,0,Traditional food,Fresh Juice,Maybe
1,Female,Indian,0,Western Food,Carbonated drinks,Yes
2,Male,Indian,1,Western Food,Fresh Juice,Maybe
3,Female,Indian,0,Traditional food,Fresh Juice,Maybe
4,Male,Indian,0,Traditional food,Fresh Juice,Maybe
...,...,...,...,...,...,...
279,Male,Indian,0,Western Food,Fresh Juice,Yes
280,Male,Indian,0,Traditional food,Fresh Juice,Yes
281,Male,Indian,0,Traditional food,Fresh Juice,Yes
282,Male,Indian,0,Traditional food,Fresh Juice,Yes


In [13]:
categorical_features = ['Gender', 'Nationality', 'Food', 'Juice', 'Dessert']

In [14]:
def get_uniques(df, columns):
    return {column: list(df[column].unique()) for column in columns}

In [15]:
get_uniques(data, categorical_features)

{'Gender': ['Male', 'Female'],
 'Nationality': ['Indian',
  'Pakistani ',
  'Tanzanian',
  'Indonesia',
  'Pakistan',
  'Maldivian ',
  'MY',
  'Malaysian',
  'Malaysian ',
  'Indonesian ',
  'Maldivian',
  'MALAYSIAN',
  'Malaysia ',
  'Pakistani',
  'Canadian',
  'Nigerian ',
  'Algerian ',
  'Korean ',
  'Seychellois',
  'Indonesain',
  'Indonesian',
  'Malaysia',
  'Japan',
  'China',
  'Mauritian',
  'Yemen'],
 'Food': ['Traditional food', 'Western Food'],
 'Juice': ['Fresh Juice', 'Carbonated drinks'],
 'Dessert': ['Maybe', 'Yes', 'No']}

In [16]:
binary_features = ['Gender', 'Food', 'Juice']

ordinal_features = ['Dessert']

nominal_features = ['Nationality']

In [17]:
def binary_encode(df, column, positive_label):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_label else 0)
    return df

In [18]:
def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

In [19]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column])
    df = pd.concat([df, dummies], axis=1)
    df.drop(column, axis=1, inplace=True)
    return df

In [20]:
data = binary_encode(data, 'Gender', 'Male')
data = binary_encode(data, 'Food', 'Traditional food')
data = binary_encode(data, 'Juice', 'Fresh Juice')

dessert_ordering = ['No', 'Maybe', 'Yes']
data = ordinal_encode(data, 'Dessert', dessert_ordering)

data = onehot_encode(data, 'Nationality')

In [21]:
data

Unnamed: 0,Gender,Age,Food,Juice,Dessert,Algerian,Canadian,China,Indian,Indonesain,...,Maldivian,Maldivian.1,Mauritian,Nigerian,Pakistan,Pakistani,Pakistani.1,Seychellois,Tanzanian,Yemen
0,1,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,1,0,0,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
280,1,0,1,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
281,1,0,1,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
282,1,0,1,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Scaling and Splitting

In [22]:
y = data['Age']
X = data.drop('Age', axis=1)

In [23]:
scaler = MinMaxScaler()

X = scaler.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

# Training

In [25]:
model = LogisticRegression()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7209302325581395

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/r0eaUpurifA