# Simple Gender Classification

## Importing required libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

## Getting dataset

In [2]:
# Load the raw table. The CSV headers and many text values carry a leading
# space (e.g. " Gender", " male"); the preprocessing cells below strip them.
df = pd.read_csv("data/proj95/gender.csv")
df.head()

Unnamed: 0,Gender,Age,Height (cm),Weight (kg),Occupation,Education Level,Marital Status,Income (USD),Favorite Color,Unnamed: 9
0,male,32,175,70,Software Engineer,Master's Degree,Married,75000,Blue,
1,male,25,182,85,Sales Representative,Bachelor's Degree,Single,45000,Green,
2,female,41,160,62,Doctor,Doctorate Degree,Married,120000,Purple,
3,male,38,178,79,Lawyer,Bachelor's Degree,Single,90000,Red,
4,female,29,165,58,Graphic Designer,Associate's Degree,Single,35000,Yellow,


## Checking for NaN values

In [3]:
# Count missing values per column to spot columns that need cleaning.
df.isna().sum()

 Gender               0
 Age                  0
 Height (cm)          0
 Weight (kg)          0
 Occupation           0
 Education Level      0
 Marital Status       0
 Income (USD)         0
 Favorite Color       0
Unnamed: 9          131
dtype: int64

In [4]:
# "Unnamed: 9" is missing in all 131 rows, so it carries no information.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps
# this cell safe to re-run once the column is already gone.
df = df.drop(columns="Unnamed: 9", errors="ignore")

## Preprocessing the dataset

In [5]:
# List the distinct raw values to check for inconsistent spellings.
df[" Gender"].unique()

array([' male', ' female', 'male', 'female'], dtype=object)

In [6]:
# Strip the stray leading space so ' male'/'male' and ' female'/'female'
# collapse into one category each. Assigning the column back avoids calling an
# inplace method on a column selection, which pandas deprecates and which does
# not modify `df` when Copy-on-Write is enabled.
df[" Gender"] = df[" Gender"].str.strip()

In [8]:
# Same check for occupations: look for duplicates that differ only by whitespace.
df[" Occupation"].unique()

array([' Software Engineer', ' Sales Representative', ' Doctor',
       ' Lawyer', ' Graphic Designer', ' Business Consultant',
       ' Marketing Specialist', ' CEO', ' Project Manager', ' Engineer',
       ' Accountant', ' Architect', ' Nurse', ' Analyst', ' Teacher',
       ' IT Manager', ' Writer', ' Business Analyst', 'Engineer',
       'Teacher', 'Doctor', 'Graphic Designer', 'IT Manager',
       'Sales Representative', 'Lawyer', 'Marketing Specialist',
       'Project Manager', 'Writer', 'Architect', 'Nurse',
       'Business Analyst', 'Accountant', 'CEO', 'Analyst',
       'Software Developer'], dtype=object)

In [12]:
# Remove surrounding whitespace so ' Doctor' and 'Doctor' become one category.
df[" Occupation"] = df[" Occupation"].str.strip()

In [13]:
# Inspect the distinct education levels before cleaning.
df[" Education Level"].unique()

array([" Master's Degree", " Bachelor's Degree", ' Doctorate Degree',
       " Associate's Degree", "Master's Degree", "Bachelor's Degree",
       'Doctorate Degree', "Associate's Degree"], dtype=object)

In [14]:
# Remove surrounding whitespace so each degree appears under a single spelling.
df[" Education Level"] = df[" Education Level"].str.strip()

In [15]:
# Inspect the distinct marital statuses before cleaning.
df[" Marital Status"].unique()

array([' Married', ' Single', ' Divorced', ' Widowed', 'Single',
       'Married', 'Divorced'], dtype=object)

In [17]:
# Remove surrounding whitespace so ' Single' and 'Single' become one category.
df[" Marital Status"] = df[" Marital Status"].str.strip()

In [18]:
# Inspect the distinct colours before cleaning.
df[" Favorite Color"].unique()

array([' Blue', ' Green', ' Purple', ' Red', ' Yellow', ' Black', ' Pink',
       ' Orange', ' Grey', 'Blue', 'Green', 'Red', 'Orange', 'Purple',
       'Yellow', 'Black', 'Grey', 'Pink'], dtype=object)

In [19]:
# Remove surrounding whitespace so ' Blue' and 'Blue' become one category.
df[" Favorite Color"] = df[" Favorite Color"].str.strip()

In [20]:
# Check dtypes and non-null counts after cleaning the text columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0    Gender           131 non-null    object
 1    Age              131 non-null    int64 
 2    Height (cm)      131 non-null    int64 
 3    Weight (kg)      131 non-null    int64 
 4    Occupation       131 non-null    object
 5    Education Level  131 non-null    object
 6    Marital Status   131 non-null    object
 7    Income (USD)     131 non-null    int64 
 8    Favorite Color   131 non-null    object
dtypes: int64(4), object(5)
memory usage: 9.3+ KB


In [23]:
# The column names themselves still carry a leading space; strip it so columns
# can be referenced as e.g. "Gender" from here on.
df.columns = df.columns.str.strip()

In [25]:
# Preview the cleaned table (stripped headers and values).
df.head()

Unnamed: 0,Gender,Age,Height (cm),Weight (kg),Occupation,Education Level,Marital Status,Income (USD),Favorite Color
0,male,32,175,70,Software Engineer,Master's Degree,Married,75000,Blue
1,male,25,182,85,Sales Representative,Bachelor's Degree,Single,45000,Green
2,female,41,160,62,Doctor,Doctorate Degree,Married,120000,Purple
3,male,38,178,79,Lawyer,Bachelor's Degree,Single,90000,Red
4,female,29,165,58,Graphic Designer,Associate's Degree,Single,35000,Yellow


In [26]:
# Encode every non-numeric (text) column as integer codes for the models.
# `labels` maps column name -> array of the original category strings; the
# position in that array is the encoded value.
#
# Selecting by "not numeric" rather than "object dtype" also catches text
# columns that pandas stores with its dedicated string dtype.
#
# NOTE: this cell overwrites `df` in place, so run it once per kernel session.
# Re-running it finds no text columns left and resets `labels` to {}.
labels = {}

for column in df.select_dtypes(exclude="number").columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    labels[column] = le.classes_

In [27]:
# Show the code -> category lookup; each array's index is the encoded integer.
labels

{'Gender': array(['female', 'male'], dtype=object),
 'Occupation': array(['Accountant', 'Analyst', 'Architect', 'Business Analyst',
        'Business Consultant', 'CEO', 'Doctor', 'Engineer',
        'Graphic Designer', 'IT Manager', 'Lawyer', 'Marketing Specialist',
        'Nurse', 'Project Manager', 'Sales Representative',
        'Software Developer', 'Software Engineer', 'Teacher', 'Writer'],
       dtype=object),
 'Education Level': array(["Associate's Degree", "Bachelor's Degree", 'Doctorate Degree',
        "Master's Degree"], dtype=object),
 'Marital Status': array(['Divorced', 'Married', 'Single', 'Widowed'], dtype=object),
 'Favorite Color': array(['Black', 'Blue', 'Green', 'Grey', 'Orange', 'Pink', 'Purple',
        'Red', 'Yellow'], dtype=object)}

In [28]:
# Preview the table after encoding: every column is now numeric.
df.head()

Unnamed: 0,Gender,Age,Height (cm),Weight (kg),Occupation,Education Level,Marital Status,Income (USD),Favorite Color
0,1,32,175,70,16,3,1,75000,1
1,1,25,182,85,14,1,2,45000,2
2,0,41,160,62,6,2,1,120000,6
3,1,38,178,79,10,1,2,90000,7
4,0,29,165,58,8,0,2,35000,8


In [30]:
# Separate the target (Gender) from the remaining feature columns.
X = df.drop("Gender", axis=1).values
y = df["Gender"].values

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts;
# stratify=y keeps the class ratio the same in both parts, which matters with
# only 131 rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [33]:
# Sanity-check the training matrix: rows kept for training x feature columns.
X_train.shape

(104, 8)

In [34]:
# The target vector must have one entry per training row.
y_train.shape

(104,)

## Training the model

In [35]:
# Baseline model. random_state pins the forest's random choices so the
# reported accuracy is reproducible on re-run.
# NOTE: the score is accuracy on a single small hold-out set, so treat it as a
# rough estimate rather than proof of a perfect classifier.
rclf = RandomForestClassifier(random_state=42)
rclf.fit(X_train, y_train)
rclf.score(X_test, y_test)

1.0

In [37]:
# Second model for comparison, scored on the same hold-out set.
# random_state is set explicitly so the result does not depend on defaults.
xclf = XGBClassifier(random_state=42)
xclf.fit(X_train, y_train)
xclf.score(X_test, y_test)

1.0

## Making Prediction

In [38]:
# Classify one hand-written person. The model was fitted on every column except
# Gender, in DataFrame order, so the row must supply a value for each of them.
# Building it by name keeps each value in its own feature's position.
# Categorical values are the integer codes listed in `labels`.
sample = {
    "Age": 22,
    "Height (cm)": 182,
    "Weight (kg)": 55,
    "Occupation": 16,       # 'Software Engineer'
    "Education Level": 1,   # "Bachelor's Degree"
    # NOTE(review): the original 7-value row omitted Marital Status, which
    # shifted income and colour into the wrong columns. 2 ('Single') is an
    # assumed value -- confirm the intended one.
    "Marital Status": 2,
    "Income (USD)": 200000,
    "Favorite Color": 7,    # 'Red'
}
feature_order = df.drop("Gender", axis=1).columns
xclf.predict([[sample[name] for name in feature_order]])

array([1])

In [39]:
# Translate the predicted class code (1, from the cell above) back to its label.
labels["Gender"][1]

'male'