In [None]:
!python --version   # Python version

# About python:  https://www.python.org/
#                Python is powerful... and fast; plays well with others; runs everywhere; is friendly & easy to learn;
#                is Open –> https://www.python.org/about/.
#     Python docs: https://docs.python.org/3/ (all documentation);
#                  https://docs.python.org/3.10/ (Recommended version – 3.10).
# The Python Tutorial (python3.10): https://docs.python.org/3.10//tutorial/index.html

# Load Modules ---
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# NumPy : The fundamental package for scientific computing with Python. NumPy is the fundamental package for scientific
#         computing in Python. It is a Python library that provides a multidimensional array object, various derived
#         objects (such as masked arrays and matrices), and an assortment of routines for fast operations on arrays,
#         including mathematical, logical, shape manipulation, sorting, selecting, I/O, discrete Fourier transforms,
#         basic linear algebra, basic statistical operations, random simulation and much more.
#     About: https://numpy.org/
#     Docs: https://numpy.org/doc/stable/
#     NumPy quickstart: https://numpy.org/doc/stable/user/quickstart.html

# Pandas: pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
#         built on top of the Python programming language.
#     About: https://pandas.pydata.org/
#     Docs: https://pandas.pydata.org/docs/
#     Getting started: https://pandas.pydata.org/docs/getting_started/index.html
#     User Guide: https://pandas.pydata.org/docs/user_guide/index.html#user-guide

# Matplotlib : Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations.
#       About: https://matplotlib.org/
#       Using Matplotlib (user guide): https://matplotlib.org/stable/users/index
#       Plot types: https://matplotlib.org/stable/plot_types/index
#       Tutorials: https://matplotlib.org/stable/tutorials/index
#       Examples: https://matplotlib.org/stable/gallery/index
#       API Reference: https://matplotlib.org/stable/api/index

# Seaborn: Seaborn is a Python data visualization library based on matplotlib. It provides a
#          high-level interface for drawing attractive and informative statistical graphics.
#   About: https://seaborn.pydata.org/

print('numpy version:',np.__version__)
print('pandas version: ',pd.__version__)
print('seaborn version:',sns.__version__)
print('pyplot: ',plt)

Python 3.10.12
numpy version: 1.23.5
pandas version:  1.5.3
seaborn version: 0.12.2
pyplot:  <module 'matplotlib.pyplot' from '/usr/local/lib/python3.10/dist-packages/matplotlib/pyplot.py'>


# Load Dataset - Fake Profile Detection

source: https://drive.google.com/file/d/1OSB5ErGwKfnKn3tNa2g4T3axPzCulj4N/view?usp=drive_link

### download dataset

In [None]:
# Download the dataset (CSV file) from Google Drive by its file id. gdown writes it to the current
# working directory under the file's Drive name (fake_profile_detection_data.csv in the recorded run).
!gdown "1OSB5ErGwKfnKn3tNa2g4T3axPzCulj4N"                                      # download by file id

Downloading...
From: https://drive.google.com/uc?id=1OSB5ErGwKfnKn3tNa2g4T3axPzCulj4N
To: /content/fake_profile_detection_data.csv
  0% 0.00/1.65M [00:00<?, ?B/s]100% 1.65M/1.65M [00:00<00:00, 117MB/s]


### load dataset

In [None]:
# Read the downloaded CSV; the path is relative, so the file must be in the current working directory.
data_df=pd.read_csv('fake_profile_detection_data.csv')                          # load dataset into a DataFrame
data_df.head()                                                                  # preview the first rows of the data

Unnamed: 0,name,screen_name,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,url,...,profile_background_image_url,profile_background_color,profile_link_color,utc_offset,protected,verified,description,updated,dataset,status
0,Davide Dellacasa,braddd,0,20370,5470,2385,145,52,Fri Apr 06 10:58:22 +0000 2007,http://braddd.tumblr.com,...,http://a0.twimg.com/profile_background_images/...,BADFCD,FF0000,3600.0,Public,VERIFIED,Founder of http://www.screenweek.it & http://w...,2/14/2015 10:54,E13,1
1,Simone Economo,eKoeS,68,3131,506,381,9,40,Mon Apr 30 15:08:42 +0000 2007,http://www.lineheight.net/,...,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,3600.0,Public,VERIFIED,BSc degree (cum laude) in Computer Engineering...,2/14/2015 10:54,E13,1
2,tacone,tacone_,7696,4024,264,87,323,16,Tue May 01 11:53:40 +0000 2007,http://t.co/LKrl1dZE,...,http://a0.twimg.com/profile_background_images/...,1A1B1F,2FC2EF,3600.0,Public,VERIFIED,Cogito ergo bestemmio.,2/14/2015 10:54,E13,1
3,alesaura,alesstar,202,40586,640,622,1118,32,Tue May 15 16:55:16 +0000 2007,http://alesstar.wordpress.com/,...,http://a0.twimg.com/images/themes/theme4/bg.gif,0099B9,0099B9,3600.0,Public,VERIFIED,"Se la vita ti dà sarde, scapocciale!",2/14/2015 10:54,E13,1
4,Angelo,PerDiletto,37318,2016,62,64,13,0,Sun May 13 19:52:00 +0000 2007,http://www.flickr.com/per_diletto,...,http://a0.twimg.com/images/themes/theme18/bg.gif,ACDED6,38543,3600.0,Public,VERIFIED,Je me souviens,2/14/2015 10:54,E13,1


In [None]:
data_df.tail()                                                                  # preview the last rows of the dataset

Unnamed: 0,name,screen_name,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,created_at,url,...,profile_background_image_url,profile_background_color,profile_link_color,utc_offset,protected,verified,description,updated,dataset,status
2813,Verda Marks,VerdaMarks1,523,1,0,17,0,0,Tue Apr 30 08:23:57 +0000 2013,,...,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,,Public,UNVERIFIED,I have been in business onlin and offline for ...,2/14/2015 10:40,INT,0
2814,Danial Campbell,DanialCampbell2,0,0,1,17,0,0,Tue Apr 30 08:34:49 +0000 2013,,...,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,,Public,UNVERIFIED,,2/14/2015 10:40,INT,0
2815,Maudie Meyer,MaudieMeyer1,327,2,0,15,0,0,Tue Apr 30 09:21:12 +0000 2013,,...,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,,Public,UNVERIFIED,,2/14/2015 10:40,INT,0
2816,Harriett Harvey,HarriettHarvey9,251,2,0,16,0,0,Tue Apr 30 11:25:11 +0000 2013,,...,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,,Public,UNVERIFIED,,2/14/2015 10:40,INT,0
2817,Gillian Wheeler,GillianWheeler3,35222,0,0,17,0,0,Tue Apr 30 12:47:51 +0000 2013,,...,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,,Public,UNVERIFIED,Hello!im a BELIEBER! IM A HUGE FAN OF JUSTIN B...,2/14/2015 10:40,INT,0


In [None]:
data_df.shape                                                                   # (number of rows, number of columns)

(2818, 35)

In [None]:
data_df.size                                                                    # total number of cells (rows * columns)

98630

In [None]:
data_df.index                                                                   # row index of the DataFrame

RangeIndex(start=0, stop=2818, step=1)

In [None]:
data_df.columns                                                                 # names of all columns

Index(['name', 'screen_name', 'fav_number', 'statuses_count',
       'followers_count', 'friends_count', 'favourites_count', 'listed_count',
       'created_at', 'url', 'lang', 'time_zone', 'location', 'default_profile',
       'default_profile_image', 'geo_enabled', 'profile_image_url',
       'profile_banner_url', 'profile_use_background_image',
       'profile_background_image_url_https', 'profile_text_color',
       'profile_image_url_https', 'profile_sidebar_border_color',
       'profile_background_tile', 'profile_sidebar_fill_color',
       'profile_background_image_url', 'profile_background_color',
       'profile_link_color', 'utc_offset', 'protected', 'verified',
       'description', 'updated', 'dataset', 'status'],
      dtype='object')

In [None]:
data_df.info()                                                                  # per-column non-null count and dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2818 entries, 0 to 2817
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   name                                2818 non-null   object 
 1   screen_name                         2818 non-null   object 
 2   fav_number                          2818 non-null   int64  
 3   statuses_count                      2818 non-null   int64  
 4   followers_count                     2818 non-null   int64  
 5   friends_count                       2818 non-null   int64  
 6   favourites_count                    2818 non-null   int64  
 7   listed_count                        2818 non-null   int64  
 8   created_at                          2818 non-null   object 
 9   url                                 463 non-null    object 
 10  lang                                2818 non-null   object 
 11  time_zone                           1069 no

### see stats of data

In [None]:
data_df.dtypes                                                                  # data type of each column

name                                   object
screen_name                            object
fav_number                              int64
statuses_count                          int64
followers_count                         int64
friends_count                           int64
favourites_count                        int64
listed_count                            int64
created_at                             object
url                                    object
lang                                   object
time_zone                              object
location                               object
default_profile                       float64
default_profile_image                 float64
geo_enabled                           float64
profile_image_url                      object
profile_banner_url                     object
profile_use_background_image          float64
profile_background_image_url_https     object
profile_text_color                     object
profile_image_url_https           

**see stats of numerical columns**

In [None]:
data_df.describe()                                                              # count, mean, std, min, quartiles and max of the numerical columns

Unnamed: 0,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,default_profile,default_profile_image,geo_enabled,profile_use_background_image,profile_background_tile,utc_offset,status
count,2818.0,2818.0,2818.0,2818.0,2818.0,2818.0,1728.0,8.0,721.0,2760.0,489.0,1069.0,2818.0
mean,4605.135912,1672.198368,371.105039,395.363023,234.541164,2.818666,1.0,1.0,1.0,1.0,1.0,1478.39102,0.52555
std,12715.619375,4884.669157,8022.631339,465.694322,1445.847248,23.48043,0.0,0.0,0.0,0.0,0.0,8108.211889,0.499435
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,-39600.0,0.0
25%,29.25,35.0,17.0,168.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3600.0,0.0
50%,529.5,77.0,26.0,306.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3600.0,1.0
75%,3617.5,1087.75,111.0,519.0,37.0,1.0,1.0,1.0,1.0,1.0,1.0,3600.0,1.0
max,219586.0,79876.0,408372.0,12773.0,44349.0,744.0,1.0,1.0,1.0,1.0,1.0,36000.0,1.0


In [None]:
# Look at the standard deviation of every numerical column. A standard deviation of zero means that
# all non-null values in the column are identical, so the values themselves carry no variation.
# NOTE(review): the describe() counts above show that these columns contain many nulls; a column that
#   holds only 1.0 and NaN also reports zero here, and the null-vs-1.0 pattern may still be
#   informative - confirm before treating such columns as useless.
data_df.describe().loc['std']                                                   # standard deviation of each numerical column

fav_number                      12715.619375
statuses_count                   4884.669157
followers_count                  8022.631339
friends_count                     465.694322
favourites_count                 1445.847248
listed_count                       23.480430
default_profile                     0.000000
default_profile_image               0.000000
geo_enabled                         0.000000
profile_use_background_image        0.000000
profile_background_tile             0.000000
utc_offset                       8108.211889
status                              0.499435
Name: std, dtype: float64

In [None]:
data_df.describe().loc['std']==0                                                # True for columns whose standard deviation is zero

fav_number                      False
statuses_count                  False
followers_count                 False
friends_count                   False
favourites_count                False
listed_count                    False
default_profile                  True
default_profile_image            True
geo_enabled                      True
profile_use_background_image     True
profile_background_tile          True
utc_offset                      False
status                          False
Name: std, dtype: bool

In [None]:
# Names of the numerical columns whose standard deviation is exactly zero.
# describe() is evaluated once; its 'std' row is filtered and the surviving labels are kept.
columns_std0 = data_df.describe().loc['std'].loc[lambda std: std == 0].index
# Preview the first rows of those zero-standard-deviation columns.
data_df[columns_std0].head()

Unnamed: 0,default_profile,default_profile_image,geo_enabled,profile_use_background_image,profile_background_tile
0,,,,1.0,
1,,,,1.0,
2,,,,1.0,1.0
3,,,1.0,1.0,
4,,,1.0,1.0,


In [None]:
data_df[columns_std0].tail()                                                    # last rows of the zero-standard-deviation columns

Unnamed: 0,default_profile,default_profile_image,geo_enabled,profile_use_background_image,profile_background_tile
2813,1.0,,,1.0,
2814,1.0,,,1.0,
2815,1.0,,,1.0,
2816,1.0,,,1.0,
2817,1.0,,,1.0,


In [None]:
# Cleaning is done on a copy, so the raw data_df stays untouched and can be used to start over at any time.
data_df_copy=data_df.copy()                                                     # make a copy of the dataset

In [None]:
# The zero-standard-deviation columns hold a single value (or nulls) only, so drop all of them.
# The frame is reassigned instead of mutated with inplace=True, and errors='ignore' is passed, so this
# cell is idempotent: re-running it after the columns are already gone no longer raises KeyError.
data_df_copy = data_df_copy.drop(columns=columns_std0, errors='ignore')
data_df_copy.describe().loc['std']                                              # re-check the standard deviation of the remaining numerical columns

fav_number          12715.619375
statuses_count       4884.669157
followers_count      8022.631339
friends_count         465.694322
favourites_count     1445.847248
listed_count           23.480430
utc_offset           8108.211889
status                  0.499435
Name: std, dtype: float64

**see stats of non-numerical (object) columns**

In [None]:
data_df_copy.describe(include='object')                                         # count, unique, top and freq of the object-type columns

Unnamed: 0,name,screen_name,created_at,url,lang,time_zone,location,profile_image_url,profile_banner_url,profile_background_image_url_https,...,profile_sidebar_border_color,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,protected,verified,description,updated,dataset
count,2818,2818,2818,463,2818,1069,2271,2818,987,2818,...,2818,2818,2818,2818,2818,2818,2818,2547,2818,2818
unique,2811,2818,2767,462,8,33,1678,2815,987,770,...,128,179,759,273,365,3,2,2534,2,2
top,Giulia,braddd,Sat Jun 23 15:33:05 +0000 2012,http://twitter.com,en,Rome,Roma,http://a0.twimg.com/sticky/default_profile_ima...,https://si0.twimg.com/profile_banners/3610511/...,https://si0.twimg.com/images/themes/theme1/bg.png,...,C0DEED,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,UNVERIFIED,nl,2/14/2015 10:54,E13
freq,3,1,5,2,1502,581,47,3,1,1729,...,1830,1990,1755,1876,1955,1961,1884,7,1481,1481


In [None]:
# DataFrame.describe() returns a DataFrame, so keep the object-column summary in a variable for reuse.
data_df_object_description = data_df_copy.describe(include='object')
# Despite its name, this is the share of REPEATED values per column: (count - unique) / count.
# 0.0 means every non-null value is distinct; values close to 1.0 mean only a few distinct values occur.
unique_value_percentage = (
    (data_df_object_description.loc['count'] - data_df_object_description.loc['unique'])
    / data_df_object_description.loc['count']
)
unique_value_percentage                                                         # show the repeated-value ratio of each object column

name                                  0.002484
screen_name                                0.0
created_at                            0.018098
url                                    0.00216
lang                                  0.997161
time_zone                              0.96913
location                              0.261118
profile_image_url                     0.001065
profile_banner_url                         0.0
profile_background_image_url_https    0.726757
profile_text_color                    0.936125
profile_image_url_https               0.001065
profile_sidebar_border_color          0.954578
profile_sidebar_fill_color             0.93648
profile_background_image_url           0.73066
profile_background_color              0.903123
profile_link_color                    0.870476
protected                             0.998935
verified                               0.99929
description                           0.005104
updated                                0.99929
dataset      

In [None]:
# Object columns in which fewer than 10% of the values are repeats (i.e. more than 90% of the values
# are distinct) behave like identifiers or free text and are treated as not useful; list their names.
unique_value_percentage[unique_value_percentage < 0.1].index

Index(['name', 'screen_name', 'created_at', 'url', 'profile_image_url',
       'profile_banner_url', 'profile_image_url_https', 'description'],
      dtype='object')

In [None]:
# Drop every object column whose repeated-value ratio is below 10% (almost all values distinct).
# The frame is reassigned instead of mutated with inplace=True, and errors='ignore' is passed, so this
# cell is idempotent: re-running it after the columns are already gone no longer raises KeyError.
data_df_copy = data_df_copy.drop(
    columns=unique_value_percentage.index[unique_value_percentage < 0.1],
    errors='ignore',
)
data_df_copy.describe(include='object')                                         # describe the remaining object-type columns again

Unnamed: 0,lang,time_zone,location,profile_background_image_url_https,profile_text_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,protected,verified,updated,dataset
count,2818,1069,2271,2818,2818,2818,2818,2818,2818,2818,2818,2818,2818,2818
unique,8,33,1678,770,180,128,179,759,273,365,3,2,2,2
top,en,Rome,Roma,https://si0.twimg.com/images/themes/theme1/bg.png,333333,C0DEED,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,UNVERIFIED,2/14/2015 10:54,E13
freq,1502,581,47,1729,2324,1830,1990,1755,1876,1955,1961,1884,1481,1481


### search for missing values

In [None]:
# Missing value handling -
# If the percentage of missing values in a column is less than 5%, drop those values sample-wise (row-wise), i.e., drop the
#   samples (rows) having that missing value (using pandas.DataFrame.dropna(axis=0, subset=[column])). Note: if missing values
#   occur in multiple columns (each less than 5%), the overall share of dropped rows must not be more than 5% to 7%. If dropping
#   samples for multiple columns removes more than 5%, go for value filling instead.
# If the percentage of missing values in a column is between 5% and 30% (or 35%), fill the missing values.
# If the percentage of missing values is more than 30% (or 40%), feature-wise (column-wise) dropping has to be done, i.e., drop
#   the columns having more than 30% (or 40%) missing values (using pandas.DataFrame.drop(columns=[column])).

In [None]:
# Split the column names by dtype so that missing values can be inspected separately for
# numerical and non-numerical columns.
numerical_columns = data_df_copy.select_dtypes(exclude='object').columns        # every column whose dtype is not object
non_numerical_columns = data_df_copy.select_dtypes(include='object').columns    # every object-dtype column

**handling missing values in numerical columns**

In [None]:
data_df_copy[numerical_columns].isna().sum()                                    # number of null values in each numerical column

fav_number             0
statuses_count         0
followers_count        0
friends_count          0
favourites_count       0
listed_count           0
utc_offset          1749
status                 0
dtype: int64

In [None]:
# Percentage of missing values in each numerical column (null count divided by the number of rows).
data_df_copy[numerical_columns].isna().sum() / len(data_df_copy) * 100

fav_number           0.000000
statuses_count       0.000000
followers_count      0.000000
friends_count        0.000000
favourites_count     0.000000
listed_count         0.000000
utc_offset          62.065295
status               0.000000
dtype: float64

In [None]:
# Drop the "utc_offset" column because more than 30% of its values are missing.
# The frame is reassigned instead of mutated with inplace=True, and errors='ignore' is passed, so this
# cell is idempotent: re-running it after the column is already gone no longer raises KeyError.
data_df_copy = data_df_copy.drop(columns=['utc_offset'], errors='ignore')
# Recompute the numerical column names (one column was removed) and confirm that no numerical
# column has missing values any more.
numerical_columns = data_df_copy.dtypes[data_df_copy.dtypes != 'object'].index
data_df_copy[numerical_columns].isna().sum()                                    # null count per numerical column after dropping

fav_number          0
statuses_count      0
followers_count     0
friends_count       0
favourites_count    0
listed_count        0
status              0
dtype: int64

**handling missing values in non-numerical columns**

In [None]:
data_df_copy[non_numerical_columns].isna().sum()                                # number of null values in each non-numerical column

lang                                     0
time_zone                             1749
location                               547
profile_background_image_url_https       0
profile_text_color                       0
profile_sidebar_border_color             0
profile_sidebar_fill_color               0
profile_background_image_url             0
profile_background_color                 0
profile_link_color                       0
protected                                0
verified                                 0
updated                                  0
dataset                                  0
dtype: int64

In [None]:
# Percentage of missing values in each non-numerical column (null count divided by the number of rows).
data_df_copy[non_numerical_columns].isna().sum() / len(data_df_copy) * 100

lang                                   0.000000
time_zone                             62.065295
location                              19.410930
profile_background_image_url_https     0.000000
profile_text_color                     0.000000
profile_sidebar_border_color           0.000000
profile_sidebar_fill_color             0.000000
profile_background_image_url           0.000000
profile_background_color               0.000000
profile_link_color                     0.000000
protected                              0.000000
verified                               0.000000
updated                                0.000000
dataset                                0.000000
dtype: float64

In [None]:
# Drop the "time_zone" column because more than 30% of its values are missing.
# The frame is reassigned instead of mutated with inplace=True, and errors='ignore' is passed, so this
# cell is idempotent: re-running it after the column is already gone no longer raises KeyError.
data_df_copy = data_df_copy.drop(columns=['time_zone'], errors='ignore')
# Recompute the non-numerical column names (one column was removed) and check which non-numerical
# columns still have missing values.
non_numerical_columns = data_df_copy.dtypes[data_df_copy.dtypes == 'object'].index
data_df_copy[non_numerical_columns].isna().sum()                                # null count per non-numerical column after dropping

lang                                    0
location                              547
profile_background_image_url_https      0
profile_text_color                      0
profile_sidebar_border_color            0
profile_sidebar_fill_color              0
profile_background_image_url            0
profile_background_color                0
profile_link_color                      0
protected                               0
verified                                0
updated                                 0
dataset                                 0
dtype: int64

In [None]:
# pandas.DataFrame.isna returns a boolean same-sized object indicating whether the values are NA.
#   NA values, such as None or numpy.NaN, get mapped to True; everything else gets mapped to False. Characters such as
#   empty strings '' or numpy.inf are not considered NA values (unless pandas.options.mode.use_inf_as_na = True is set).
# So object-type columns may still hide "missing" data behind placeholder values such as "?" or " " (single space).
# Count the cells that are exactly equal to each common placeholder: "?", "-", "_", " ", tab, newline and "".
for abnormal in ['?', '-', '_', ' ', '\t', '\n', '']:
    # !r prints the escaped form of the token ('\t', '\n', ''), so whitespace placeholders stay readable
    # on a single output line instead of being emitted as a raw tab or line break.
    print(f'Count of {abnormal!r} is:', (data_df_copy == abnormal).sum().sum())
# In the recorded run every count is 0, i.e. no placeholder value was found.
# Note: only cells equal to a token as a whole are counted; padded variants (e.g. two spaces) are not detected.

Count of "?" is: 0
Count of "-" is: 0
Count of "_" is: 0
Count of " " is: 0
Count of "	" is: 0
Count of "
" is: 0
Count of "" is: 0


In [None]:
data_df_copy.isna().sum()                                                       # number of null / missing values in every remaining column

fav_number                              0
statuses_count                          0
followers_count                         0
friends_count                           0
favourites_count                        0
listed_count                            0
lang                                    0
location                              547
profile_background_image_url_https      0
profile_text_color                      0
profile_sidebar_border_color            0
profile_sidebar_fill_color              0
profile_background_image_url            0
profile_background_color                0
profile_link_color                      0
protected                               0
verified                                0
updated                                 0
dataset                                 0
status                                  0
dtype: int64

In [None]:
# "location" is the only column that still has missing values and they have to be filled, so inspect
# the column to see whether it is a continuous or a discrete variable.
data_df_copy['location'].head(10)                                               # first ten values of the column having missing values

0                           Roma
1                    Rome, Italy
2                      Internets
3                            NaN
4    iPhone: 44.069630,12.569966
5                           Rome
6      Milano, Lombardia, Italia
7                            NaN
8      iPhone: 0.000000,0.000000
9                  Chioggia (VE)
Name: location, dtype: object

In [None]:
data_df_copy['location'].tail(10)                                               # last ten values of the column having missing values

2808     Around The World
2809                  NaN
2810    Las Vages, Navada
2811           London, UK
2812       Cleveland,Ohio
2813           Murphy, NC
2814                  NaN
2815           Rome,Italy
2816                  NaN
2817                  NaN
Name: location, dtype: object

In [None]:
# "location" is a discrete variable, so look at how often each distinct value occurs in the column.
data_df_copy['location'].value_counts()                                         # occurrence count of each value in the "location" column

Roma                          47
Milano                        40
Italy                         30
USA                           17
Italia                        16
                              ..
Kensington Garden.             1
MelÃ¨eâ„¢ Island               1
Emilia Romagna, Italy          1
in fuga dallo Chateau d'If     1
Murphy, NC                     1
Name: location, Length: 1678, dtype: int64

In [None]:
data_df_copy['location'].mode()                                                 # most frequently occurring value in the "location" column
# Note: Series.mode() returns a Series (use .mode().values to get a numpy.ndarray instead).

0    Roma
Name: location, dtype: object

In [None]:
# Series.mode() returns a Series, so take its first entry and use it to fill the missing "location" values.
# NOTE(review): the recorded outputs show 547 missing rows versus 47 occurrences of the mode, so after
#   this step the filled value dominates the column - confirm that mode imputation is what is wanted here.
data_df_copy['location'] = data_df_copy['location'].fillna(
    data_df_copy['location'].mode().iloc[0]
)
data_df_copy.isna().sum().sum()                                                 # total number of null values left in the whole frame

0

In [None]:
data_df_copy.head()                                                             # first rows of the data after missing value handling

Unnamed: 0,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang,location,profile_background_image_url_https,profile_text_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,protected,verified,updated,dataset,status
0,0,20370,5470,2385,145,52,it,Roma,https://si0.twimg.com/profile_background_image...,0C3E53,F2E195,FFF7CC,http://a0.twimg.com/profile_background_images/...,BADFCD,FF0000,Public,VERIFIED,2/14/2015 10:54,E13,1
1,68,3131,506,381,9,40,en,"Rome, Italy",https://si0.twimg.com/images/themes/theme1/bg.png,333333,FFFFFF,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,VERIFIED,2/14/2015 10:54,E13,1
2,7696,4024,264,87,323,16,en,Internets,https://si0.twimg.com/profile_background_image...,666666,181A1E,0,http://a0.twimg.com/profile_background_images/...,1A1B1F,2FC2EF,Public,VERIFIED,2/14/2015 10:54,E13,1
3,202,40586,640,622,1118,32,en,Roma,https://si0.twimg.com/images/themes/theme4/bg.gif,3C3940,FFFFFF,95E8EC,http://a0.twimg.com/images/themes/theme4/bg.gif,0099B9,0099B9,Public,VERIFIED,2/14/2015 10:54,E13,1
4,37318,2016,62,64,13,0,it,"iPhone: 44.069630,12.569966",https://si0.twimg.com/images/themes/theme18/bg...,333333,EEEEEE,F6F6F6,http://a0.twimg.com/images/themes/theme18/bg.gif,ACDED6,38543,Public,VERIFIED,2/14/2015 10:54,E13,1


In [None]:
data_df_copy.tail()                                                             # last rows of the data after missing value handling

Unnamed: 0,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang,location,profile_background_image_url_https,profile_text_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,protected,verified,updated,dataset,status
2813,523,1,0,17,0,0,en,"Murphy, NC",https://si0.twimg.com/images/themes/theme1/bg.png,333333,C0DEED,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,UNVERIFIED,2/14/2015 10:40,INT,0
2814,0,0,1,17,0,0,en,Roma,https://si0.twimg.com/images/themes/theme1/bg.png,333333,C0DEED,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,UNVERIFIED,2/14/2015 10:40,INT,0
2815,327,2,0,15,0,0,en,"Rome,Italy",https://si0.twimg.com/images/themes/theme1/bg.png,333333,C0DEED,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,UNVERIFIED,2/14/2015 10:40,INT,0
2816,251,2,0,16,0,0,en,Roma,https://si0.twimg.com/images/themes/theme1/bg.png,333333,C0DEED,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,UNVERIFIED,2/14/2015 10:40,INT,0
2817,35222,0,0,17,0,0,en,Roma,https://si0.twimg.com/images/themes/theme1/bg.png,333333,C0DEED,DDEEF6,http://a0.twimg.com/images/themes/theme1/bg.png,C0DEED,0084B4,Public,UNVERIFIED,2/14/2015 10:40,INT,0


### [label encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) (how does it work?)

In [None]:
# Small example label list (three classes, with repeats) used to demonstrate label encoding.
labels = [
    'dog', 'cat', 'bird', 'cat',
    'dog', 'cat', 'bird',
]

In [None]:
# Encode the example labels with `sklearn.preprocessing.LabelEncoder`.
from sklearn.preprocessing import LabelEncoder                                  # label encoder class

encoder = LabelEncoder()                                                        # unfitted encoder object
# fit_transform learns the classes from `labels` and returns their integer codes in one call;
# the fitted `encoder` stays available for later transform / inverse_transform calls.
encoded_labels = encoder.fit_transform(labels)
# Show each original label next to its integer code.
for label, encoded_label in zip(labels, encoded_labels):
    print(f'{label} -> {encoded_label}')

dog -> 2
cat -> 1
bird -> 0
cat -> 1
dog -> 2
cat -> 1
bird -> 0


In [None]:
# Encode a few more labels with the encoder that is already fitted.
test_labels = ['cat', 'bird', 'dog', 'cat']
encoded_test_labels = encoder.transform(test_labels)
print(f'Labels -> {test_labels} ; Encoded labels -> {encoded_test_labels}')

# Decode: inverse_transform maps the integer codes back to the original labels.
decoded_labels = encoder.inverse_transform(encoded_labels)
print(f'Encoded labels -> {encoded_labels} ; Decode labels -> {decoded_labels}')

Labels -> ['cat', 'bird', 'dog', 'cat'] ; Encoded labels -> [1 0 2 1]
Encoded labels -> [2 1 0 1 2 1 0] ; Decode labels -> ['dog' 'cat' 'bird' 'cat' 'dog' 'cat' 'bird']


In [None]:
# How does the label encoder work?
# Step 1 (fit): build the sorted list of unique labels.
# Step 2 (transform): replace every label with its position in that sorted list.
class_labels = sorted(set(labels))                                          # step 1 - what `fit` learns
# Position of each class in the sorted list; a dict lookup avoids rescanning the list for every label.
label_to_code = {label: code for code, label in enumerate(class_labels)}
encoded_labels_manually = [label_to_code[label] for label in labels]        # step 2 - what `transform` does

# Verify both steps against the fitted scikit-learn encoder.
assert class_labels == list(encoder.classes_), 'sorted unique labels should match encoder.classes_'
assert list(encoded_labels) == encoded_labels_manually, 'manual codes should match the LabelEncoder output'

### encode labels

In [None]:
# Work on an independent (deep) copy so the encoded columns are written to a separate frame
# and `data_df_copy` keeps its original values.
data_df_copy_encoded = data_df_copy.copy(deep=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every object-dtype column, keeping one fitted encoder per column
# so the integer codes can later be decoded with `inverse_transform`.
object_columns = [name for name, dtype in data_df_copy.dtypes.items() if dtype == 'object']
encoders = {}
for column in object_columns:
  encoders[column] = LabelEncoder()
  column_codes = encoders[column].fit_transform(data_df_copy[column])   # learn the classes and encode in one pass
  data_df_copy_encoded[column] = column_codes.astype('int64')           # store the codes explicitly as int64

# Preview the first rows of the encoded data.
data_df_copy_encoded.head()

Unnamed: 0,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang,location,profile_background_image_url_https,profile_text_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,protected,verified,updated,dataset,status
0,0,20370,5470,2385,145,52,5,1005,45,7,109,173,46,160,356,2,1,1,0,1
1,68,3131,506,381,9,40,1,1021,0,43,125,108,0,166,10,2,1,1,0,1
2,7696,4024,264,87,323,16,1,551,37,78,11,0,37,55,112,2,1,1,0,1
3,202,40586,640,622,1118,32,1,1005,13,49,125,65,13,5,17,2,1,1,0,1
4,37318,2016,62,64,13,0,5,1458,9,43,105,151,9,153,123,2,1,1,0,1


In [None]:
# Show the last five rows of the encoded data.
data_df_copy_encoded.tail()

Unnamed: 0,fav_number,statuses_count,followers_count,friends_count,favourites_count,listed_count,lang,location,profile_background_image_url_https,profile_text_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_background_image_url,profile_background_color,profile_link_color,protected,verified,updated,dataset,status
2813,523,1,0,17,0,0,1,776,0,43,75,108,0,166,10,2,0,0,1,0
2814,0,0,1,17,0,0,1,1005,0,43,75,108,0,166,10,2,0,0,1,0
2815,327,2,0,15,0,0,1,1023,0,43,75,108,0,166,10,2,0,0,1,0
2816,251,2,0,16,0,0,1,1005,0,43,75,108,0,166,10,2,0,0,1,0
2817,35222,0,0,17,0,0,1,1005,0,43,75,108,0,166,10,2,0,0,1,0


In [None]:
# Check the column dtypes - every column should be numeric after encoding.
data_df_copy_encoded.dtypes

fav_number                            int64
statuses_count                        int64
followers_count                       int64
friends_count                         int64
favourites_count                      int64
listed_count                          int64
lang                                  int64
location                              int64
profile_background_image_url_https    int64
profile_text_color                    int64
profile_sidebar_border_color          int64
profile_sidebar_fill_color            int64
profile_background_image_url          int64
profile_background_color              int64
profile_link_color                    int64
protected                             int64
verified                              int64
updated                               int64
dataset                               int64
status                                int64
dtype: object

In [None]:
# Save the cleaned, encoded data as a CSV file. The DataFrame index is written as an
# unnamed first column (the pandas default), so read the file back with `index_col=0`.
data_df_copy_encoded.to_csv('cleaned_fake_profile_detection_data.csv', index=True)

In [None]:
# Persist the dictionary of per-column label encoders so the same mappings can be reused later.
# NOTE: only unpickle this file when it comes from a trusted source.
import pickle

with open('encoders_fake_profile_data', 'wb') as encoder_file_object:   # pickle writes bytes, hence binary mode
  pickle.dump(encoders, encoder_file_object)

# References

* [The Python Tutorial](https://docs.python.org/3.10/tutorial/index.html)
* [NumPy quickstart](https://numpy.org/doc/stable/user/quickstart.html)
* [10 Minutes to Pandas (Pandas, official tutorial)](https://pandas.pydata.org/docs/user_guide/10min.html)
* [Scikit Learn's Label Encoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)