# Twitter User Gender Classification

Please note the column **gender** will be the target feature in the dataset provided.
* Perform the required cleaning.
* Perform EDA to understand data better.
* Perform Feature selection/engineering/scaling (if required).
* Build a neural network using Sklearn and/or tensorflow.

In [1]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Activation

df = pd.read_csv('gender-classifier-DFE-791531.csv')

In [2]:
df.shape

(20050, 26)

In [3]:
df.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,gender,gender:confidence,profile_yn,profile_yn:confidence,created,...,profileimage,retweet_count,sidebar_color,text,tweet_coord,tweet_count,tweet_created,tweet_id,tweet_location,user_timezone
0,815719226,False,finalized,3,10/26/15 23:24,male,1.0,yes,1.0,12/5/13 1:48,...,https://pbs.twimg.com/profile_images/414342229...,0,FFFFFF,Robbie E Responds To Critics After Win Against...,,110964,10/26/15 12:40,6.5873e+17,main; @Kan1shk3,Chennai
1,815719227,False,finalized,3,10/26/15 23:30,male,1.0,yes,1.0,10/1/12 13:51,...,https://pbs.twimg.com/profile_images/539604221...,0,C0DEED,���It felt like they were my friends and I was...,,7471,10/26/15 12:40,6.5873e+17,,Eastern Time (US & Canada)
2,815719228,False,finalized,3,10/26/15 23:33,male,0.6625,yes,1.0,11/28/14 11:30,...,https://pbs.twimg.com/profile_images/657330418...,1,C0DEED,i absolutely adore when louis starts the songs...,,5617,10/26/15 12:40,6.5873e+17,clcncl,Belgrade
3,815719229,False,finalized,3,10/26/15 23:10,male,1.0,yes,1.0,6/11/09 22:39,...,https://pbs.twimg.com/profile_images/259703936...,0,C0DEED,Hi @JordanSpieth - Looking at the url - do you...,,1693,10/26/15 12:40,6.5873e+17,"Palo Alto, CA",Pacific Time (US & Canada)
4,815719230,False,finalized,3,10/27/15 1:15,female,1.0,yes,1.0,4/16/14 13:23,...,https://pbs.twimg.com/profile_images/564094871...,0,0,Watching Neighbours on Sky+ catching up with t...,,31462,10/26/15 12:40,6.5873e+17,,


**EDA and Cleaning**

In [4]:
df.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'gender', 'gender:confidence', 'profile_yn',
       'profile_yn:confidence', 'created', 'description', 'fav_number',
       'gender_gold', 'link_color', 'name', 'profile_yn_gold', 'profileimage',
       'retweet_count', 'sidebar_color', 'text', 'tweet_coord', 'tweet_count',
       'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  pr

In [6]:
#Discarding irrelevant data, unusable data and data with little non-null values
df = df.drop(['_unit_id','description','gender_gold','name','profile_yn_gold','profileimage','text','tweet_coord','tweet_id','tweet_location','user_timezone'], axis=1)

In [7]:
df['gender']=df['gender'].replace(['unknown','brand'], np.NaN)
df=df.drop(df[df['gender'].isna()].index).reset_index(drop=True)
df['gender'].value_counts()

female    6700
male      6194
Name: gender, dtype: int64

In [8]:
df = df.drop(df[df['_last_judgment_at'].isna()].index).reset_index(drop=True)
df['_last_judgment_at'] = pd.to_datetime(df['_last_judgment_at'])
df['created'] = pd.to_datetime(df['created'])
df['tweet_created'] = pd.to_datetime(df['tweet_created'])

df['last_judgment_year'] = df['_last_judgment_at'].apply(lambda x: x.year)
df['last_judgment_month'] = df['_last_judgment_at'].apply(lambda x: x.month)
df['last_judgment_day'] = df['_last_judgment_at'].apply(lambda x: x.day)
df['last_judgment_hour'] = df['_last_judgment_at'].apply(lambda x: x.hour)

df['created_year'] = df['created'].apply(lambda x: x.year)
df['created_month'] = df['created'].apply(lambda x: x.month)
df['created_day'] = df['created'].apply(lambda x: x.day)
df['created_hour'] = df['created'].apply(lambda x: x.hour)

df['tweet_year'] = df['tweet_created'].apply(lambda x: x.year)
df['tweet_month'] = df['tweet_created'].apply(lambda x: x.month)
df['tweet_day'] = df['tweet_created'].apply(lambda x: x.day)
df['tweet_hour'] = df['tweet_created'].apply(lambda x: x.hour)

df = df.drop(['_last_judgment_at','created','tweet_created'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12858 entries, 0 to 12857
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _golden                12858 non-null  bool   
 1   _unit_state            12858 non-null  object 
 2   _trusted_judgments     12858 non-null  int64  
 3   gender                 12858 non-null  object 
 4   gender:confidence      12858 non-null  float64
 5   profile_yn             12858 non-null  object 
 6   profile_yn:confidence  12858 non-null  float64
 7   fav_number             12858 non-null  int64  
 8   link_color             12858 non-null  object 
 9   retweet_count          12858 non-null  int64  
 10  sidebar_color          12858 non-null  object 
 11  tweet_count            12858 non-null  int64  
 12  last_judgment_year     12858 non-null  int64  
 13  last_judgment_month    12858 non-null  int64  
 14  last_judgment_day      12858 non-null  int64  
 15  la

In [9]:
#get color in hex and return RGB values in decimal
def hextodec(value):
    try:
        return int(value, 16)
    except:
        return 0

def getRGB(color):
    r=color.apply(lambda x: hextodec(x[0:2]))
    g=color.apply(lambda x: hextodec(x[2:4]))
    b=color.apply(lambda x: hextodec(x[4:6]))
    return r, g, b

df['link_red'], df['link_green'], df['link_blue'] = getRGB(df['link_color'])
df['sidebar_red'], df['sidebar_green'], df['sidebar_blue'] = getRGB(df['sidebar_color'])
df = df.drop(['link_color','sidebar_color'], axis=1).reset_index(drop=True)
df.columns

Index(['_golden', '_unit_state', '_trusted_judgments', 'gender',
       'gender:confidence', 'profile_yn', 'profile_yn:confidence',
       'fav_number', 'retweet_count', 'tweet_count', 'last_judgment_year',
       'last_judgment_month', 'last_judgment_day', 'last_judgment_hour',
       'created_year', 'created_month', 'created_day', 'created_hour',
       'tweet_year', 'tweet_month', 'tweet_day', 'tweet_hour', 'link_red',
       'link_green', 'link_blue', 'sidebar_red', 'sidebar_green',
       'sidebar_blue'],
      dtype='object')

In [10]:
#Remove features with only one value
for i in df:
    if(len(df[i].unique())==1):
        print(df[i].value_counts())
        df=df.drop([i], axis=1)

False    12858
Name: _golden, dtype: int64
finalized    12858
Name: _unit_state, dtype: int64
3    12858
Name: _trusted_judgments, dtype: int64
yes    12858
Name: profile_yn, dtype: int64
2015    12858
Name: last_judgment_year, dtype: int64
10    12858
Name: last_judgment_month, dtype: int64
2015    12858
Name: tweet_year, dtype: int64
10    12858
Name: tweet_month, dtype: int64
26    12858
Name: tweet_day, dtype: int64


In [11]:
label_map = {'female': 0, 'male': 1}
df['gender'] = df['gender'].replace(label_map)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12858 entries, 0 to 12857
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 12858 non-null  int64  
 1   gender:confidence      12858 non-null  float64
 2   profile_yn:confidence  12858 non-null  float64
 3   fav_number             12858 non-null  int64  
 4   retweet_count          12858 non-null  int64  
 5   tweet_count            12858 non-null  int64  
 6   last_judgment_day      12858 non-null  int64  
 7   last_judgment_hour     12858 non-null  int64  
 8   created_year           12858 non-null  int64  
 9   created_month          12858 non-null  int64  
 10  created_day            12858 non-null  int64  
 11  created_hour           12858 non-null  int64  
 12  tweet_hour             12858 non-null  int64  
 13  link_red               12858 non-null  int64  
 14  link_green             12858 non-null  int64  
 15  li

In [12]:
df.columns

Index(['gender', 'gender:confidence', 'profile_yn:confidence', 'fav_number',
       'retweet_count', 'tweet_count', 'last_judgment_day',
       'last_judgment_hour', 'created_year', 'created_month', 'created_day',
       'created_hour', 'tweet_hour', 'link_red', 'link_green', 'link_blue',
       'sidebar_red', 'sidebar_green', 'sidebar_blue'],
      dtype='object')

**Arrange and normalize features**

In [13]:
X = df[['gender:confidence', 'profile_yn:confidence', 'fav_number',
       'retweet_count', 'tweet_count', 'last_judgment_day',
       'last_judgment_hour', 'created_year', 'created_month', 'created_day',
       'created_hour', 'tweet_hour', 'link_red', 'link_green', 'link_blue',
       'sidebar_red', 'sidebar_green', 'sidebar_blue']]
Y = df[['gender']]

In [14]:
normalized_X = preprocessing.StandardScaler().fit_transform(X)

In [15]:
X_bal, Y_bal = RandomUnderSampler().fit_resample(normalized_X, Y)

In [16]:
Y_bal['gender'].value_counts()

0    6173
1    6173
Name: gender, dtype: int64

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X_bal, Y_bal, train_size=0.75)

## Scikit-Learn NN model
* MultiLayer Perceptron
* Activation function: ReLU (Default)
* Solver : adam (Default)

In [18]:
mlp = MLPClassifier(hidden_layer_sizes=(25,7), max_iter=400)

In [19]:
mlp.fit(X_train, Y_train.values.ravel())

MLPClassifier(hidden_layer_sizes=(25, 7), max_iter=400)

In [20]:
Y_pred_nn = mlp.predict(X_test)
accuracy_score(Y_test, Y_pred_nn)

0.6122448979591837

## Tensorflow model
* Activation function : ReLu for hidden layers, Softmax for Output
* Loss function: Binary Cross Entropy
* Optimizer : adam

In [21]:
model = keras.Sequential()
model.add(Input(shape=(18,)),)
model.add(Dense(22, activation='relu'),)
model.add(Dense(28, activation='relu'),)
model.add(Dense(18, activation='relu'),)
model.add(Dense(2, activation='softmax'),)

In [22]:
model.compile(optimizer='adam', loss='binary_crossentropy',)

In [23]:
model.fit(X_train, Y_train, epochs=30, verbose=False)

<tensorflow.python.keras.callbacks.History at 0x27f0a6df580>

In [24]:
model.evaluate(X_test, Y_test, verbose=False)

0.6931466460227966