In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows',None)


# loading the data
def load_data(path):
  print("Loading Data.!")
  data = pd.read_csv(path)
  print("Data succesfully loaded.!")
  return data

path=r'Raw_Data\DATA.csv'
data = load_data(path)

#--------------------------------------------------------------------------------------

# set Id column as index
def set_Id_as_index(data):
  """
    This method makes the Id column as index
  """
  print("Set ID as Index.!")
  data = data.set_index('Id')
  print("ID set as index succesfull.!")
  return data

print("\n")
data = set_Id_as_index(data)
data.head(2)

#--------------------------------------------------------------------------------------

# remove duplicate rows
def remove_duplicated_rows(df):
  """
    This method removes the duplicate rows from the data.
  """
  print("Remove Duplicate rows started.!")
  duplicated_values_details = df.duplicated().sum()
  print("Duplicate Rows present in data: ",duplicated_values_details)
  print('Data Shape With Duplicated: ',df.shape)
  shape_with_duplicate = df.shape
  new_df = df.drop_duplicates()
  print('Data Shape Without Duplicated: ',new_df.shape)
  shape_without_duplicate = new_df.shape
  print("No. Duplicated Rows Removed: ",shape_with_duplicate[0]-shape_without_duplicate[0])
  print("Duplicate Rows in data: ",df.duplicated().sum())
  print("Duplicate rows removed Succesfully.!")
  return new_df

print("\n")
data=remove_duplicated_rows(data)

#--------------------------------------------------------------------------------------

# splitting the data in independent variables(X) and dependent variables (y)
def split_data_into_X_y(data):
  """This method divides the data into independent variables(X) and dependent variables (y).
  -> Takes data frame as input
  <- Returns data frames of X, y
  """
  print("Spliting the Data into Independent Variable(X),Dependent Variable y started.!")
  X = data.loc[:,data.columns != 'Response']
  y= data['Response']
  print("Data Splited in into Independent Variable(X),Dependent Variable y successfully.!")
  return X,y

print("\n")
X,y = split_data_into_X_y(data)

#--------------------------------------------------------------------------------------

# split the data for training and testing
def train_test_split_X_y(X,y):
  """This method divides  independent variables(X) and dependent variables (y)
     into X_train,X_test,y_train,y_test with 30% data for testing and 70% data for training.
     -> Takes data frame X,y as input 
     <- Returns X_train(data for traing),X_test(data for testing),y_train(data for traing),y_test(data for testing)
  """
  print("Train Test Split Started.!")
  X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.30, random_state=42)
  
  return X_train,X_test,y_train,y_test

print("\n")
X_train,X_test,y_train,y_test= train_test_split_X_y(X,y) 

print("Shape of X_train: ",X_train.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of X_test : ",X_test.shape)
print("Shape of y_test : ",y_test.shape)
print("Train Test Split Succesfull.!")

#--------------------------------------------------------------------------------------

print("\n")
# checking for missing values in all the data columns
def checking_missing_values(df):
  """This function gives details of missing values and return a dataframe
     with two columns, missing values count and the percentage of missing values.
     -> Takes original dataframe as parameter
     <- Returns dataframe with details of missing values
  """
  missing_values_details = pd.DataFrame(df.isnull().sum(),columns=['missing_values']) 
  missing_values_details['missing_values_%']=df.isnull().sum()/len(df)*100
  missing_values_details = missing_values_details.sort_values(ascending=False,by='missing_values_%')
  missing_value_details = missing_values_details[missing_values_details['missing_values']!=0]
  return missing_value_details

missing_value_details = checking_missing_values(X_train)
print("Missing Value details in X_train: \n",missing_value_details)
print("\n")
missing_value_details = checking_missing_values(X_test)
print("Missing Value details in X_test: \n",missing_value_details)
print("\n")

# removing columns with more than 50% missing values in all the data columns
def remove_missing_value_columns(df,per_missing_value):
  """This method removes column with more than 50 % missing values.
     <- Return data frame with colums removed with more than 50% percent missing values
  """
  missing_value_details = checking_missing_values(df)
  missing_values_col_list = missing_value_details[missing_value_details['missing_values_%']>per_missing_value].index.values.tolist()
  total_missing_values = missing_value_details[missing_value_details['missing_values_%']!=0].index.values.tolist()
  print("Total Missing Values Columns List : ",total_missing_values)
  print("Total Number of columns with missing values: ",len(total_missing_values))
  print(f"Number Columns with missing values more than {per_missing_value}%: ",len(missing_values_col_list))
  print("Shape of Data Before droping column: ",df.shape)
  new_df = df.drop(columns=missing_values_col_list,axis=1)
  print("Shape of Data After droping column: ",new_df.shape)
  
  missing_value_details = checking_missing_values(new_df)
  #missing_values_col_list = missing_value_details[missing_value_details['missing_values_%']>per_missing_value].index.values.tolist()
  total_missing_values = missing_value_details[missing_value_details['missing_values_%']!=0].index.values.tolist()
  print("Total Missing Values Columns List After droping the colums: ",total_missing_values)
  print("Total Number of columns with missing values left: ",len(total_missing_values))
  print("No. of columns droped: ",df.shape[1]-new_df.shape[1])
  return new_df

print("For X_train..........Droping Columns with missing values more than 40%.!")
X_train = remove_missing_value_columns(X_train,40)
print("For X_train..........Droping Columns Succesfull.!")
print("For X_test..........Droping Columns with missing values more than 40%.!")
X_test = remove_missing_value_columns(X_test,40)
print("For X_train..........Droping Columns Succesfull.!")


print("\n")
print("X_train shape after removing missing value columns: ",X_train.shape)
print("X_train shape after removing missing value columns: ",X_test.shape)
print("\n")
print("Now Start Imputing the missing value for columns having less than 40% missing values.! ")
print("After analyzing distributions of columns, we will adopt two strategies to fill missing values: -")
print("1.Fill with medain")
print("2.Fill with random values of that same column.")
print("\n")

#--------------------------------------------------------------------------------------

print("Missing values Imputation Started.!")


# filling missing values with median for columns Medical_History_1,Employment_Info_4,Employment_Info_1
def imputing_missing_values(train,test):
  imputer1 = SimpleImputer(fill_value=np.nan,strategy='median')
  imputer2 = SimpleImputer(fill_value=np.nan,strategy='median')
  imputer3 = SimpleImputer(fill_value=np.nan,strategy='median')
    

  print("Imputation in X_train started.!")
  print("Before impution null values Medical_History_1: ",train['Medical_History_1'].isnull().sum())
  print("Before impution null values Employment_Info_4: ",train['Employment_Info_4'].isnull().sum())
  print("Before impution null values Employment_Info_1: ",train['Employment_Info_1'].isnull().sum())

  train['Medical_History_1'] = imputer1.fit_transform(train[['Medical_History_1']])
  train['Employment_Info_4'] = imputer2.fit_transform(train[['Employment_Info_4']])
  train['Employment_Info_1'] = imputer3.fit_transform(train[['Employment_Info_1']])

  print("After impution null values Medical_History_1: ",train['Medical_History_1'].isnull().sum())
  print("After impution null values Employment_Info_4: ",train['Employment_Info_4'].isnull().sum())
  print("After impution null values Employment_Info_1: ",train['Employment_Info_1'].isnull().sum())


  # filling missing values in column Family_Hist_4,Employment_Info_6 with the random-sample-imputation values
  print("Before impution null values Family_Hist_4     : ",train['Family_Hist_4'].isnull().sum())
  print("Before impution null values Employment_Info_6 : ",train['Employment_Info_6'].isnull().sum())

  train['Family_Hist_4'][train['Family_Hist_4'].isnull()] = train['Family_Hist_4'].dropna().sample(train['Family_Hist_4'].isnull().sum()).values
  train['Employment_Info_6'][train['Employment_Info_6'].isnull()] = train['Employment_Info_6'].dropna().sample(train['Employment_Info_6'].isnull().sum()).values

  print("After impution null values Family_Hist_4     : ",train['Family_Hist_4'].isnull().sum())
  print("After impution null values Employment_Info_6 : ",train['Employment_Info_6'].isnull().sum())
  print("Imputation in X_train completed Succesfully.!")
  print("")
  # ----------------------------------------------------------------------------------------------------------------
  print("Imputation in X_test started.!")
  print("Before impution null values Medical_History_1: ",test['Medical_History_1'].isnull().sum())
  print("Before impution null values Employment_Info_4: ",test['Employment_Info_4'].isnull().sum())
  print("Before impution null values Employment_Info_1: ",test['Employment_Info_1'].isnull().sum())

  test['Medical_History_1'] = imputer1.transform(test[['Medical_History_1']])
  test['Employment_Info_4'] = imputer2.transform(test[['Employment_Info_4']])
  test['Employment_Info_1'] = imputer3.transform(test[['Employment_Info_1']])

  print("After impution null values Medical_History_1: ",test['Medical_History_1'].isnull().sum())
  print("After impution null values Employment_Info_4: ",test['Employment_Info_4'].isnull().sum())
  print("After impution null values Employment_Info_1: ",test['Employment_Info_1'].isnull().sum())


  # filling missing values in column Family_Hist_4,Employment_Info_6 with the random-sample-imputation values
  print("Before impution null values Family_Hist_4     : ",test['Family_Hist_4'].isnull().sum())
  print("Before impution null values Employment_Info_6 : ",test['Employment_Info_6'].isnull().sum())

  test['Family_Hist_4'][test['Family_Hist_4'].isnull()] = test['Family_Hist_4'].dropna().sample(test['Family_Hist_4'].isnull().sum()).values
  test['Employment_Info_6'][test['Employment_Info_6'].isnull()] = test['Employment_Info_6'].dropna().sample(test['Employment_Info_6'].isnull().sum()).values

  print("After impution null values Family_Hist_4     : ",test['Family_Hist_4'].isnull().sum())
  print("After impution null values Employment_Info_6 : ",test['Employment_Info_6'].isnull().sum())
   
  print("Imputation in X_test completed Succesfully.!")


  return train,test

X_train,X_test = imputing_missing_values(X_train,X_test)

print("\n")
missing_value_details = checking_missing_values(X_train)
print("Missing Value details in X_train: \n",missing_value_details)
print("\n")
missing_value_details = checking_missing_values(X_test)
print("Missing Value details in X_test: \n",missing_value_details)
print("\n")



#--------------------------------------------------------------------------------------

# transforming the column Product_Info_2 into two columns.
def transform_Product_Info_2(df):
  """This function converts Product_Info_2 into two columns 
     Product_Info_2_alpha and Product_Info_2_digit.

      Ex - Product_Info_2      Product_Info_2_alpha    Product_Info_2_digit
                D3	                 D                         3
                A1                   A                         1
     -> Takes original data frame ['Product_Info_2'] as input 
     <- Returns a new dataframe with two new columns created from Product_Info_2 
  """
  Product_Info_2_alpha=[]
  Product_Info_2_digit=[]
  for i in df:
    Product_Info_2_alpha.append(i[0])
    Product_Info_2_digit.append(i[1])
  new_df_Product_Info_2 = pd.DataFrame(index = df.index)
  new_df_Product_Info_2 = new_df_Product_Info_2.assign(Product_Info_2_alpha=Product_Info_2_alpha,Product_Info_2_digit=Product_Info_2_digit)
  return new_df_Product_Info_2


# replacing Product_Info_2 with transformed columns feature enginnering.
def get_extracted_Product_Info_2(transformed_Product_Info_2,original_df):
  """This method drops the Product_Info_2 column and 
     adds the two new columns Product_Info_2_alpha and Product_Info_2_digit.
     -> Takes newly transformed datafame which we created from transform_Product_Info_2 function as input and original data frame
     <- Returns new data frame for futher processing of project. 
  """
  new_transformed_df = pd.concat([transformed_Product_Info_2,original_df],axis=1)
  new_transformed_df = new_transformed_df.drop(columns=['Product_Info_2'],axis = 1)
  return new_transformed_df


print("\n")
print("X_train shape Before extracting columns from Product_Info_2: ",X_train.shape)
print("X_train shape Before extracting columns from Product_Info_2: ",X_test.shape)
print("\n")


print("Started Extracting new columns from Product_Info_2 for X_train.!")
original_df = X_train
transform_Product_Info_21 = transform_Product_Info_2(X_train['Product_Info_2'])
X_train = get_extracted_Product_Info_2(transform_Product_Info_21,original_df)
print("Extraction of new columns from Product_Info_2 for X_train Completed Successfully.!")


print("Started Extracting new columns from Product_Info_2 for X_test.!")
original_df = X_test
transform_Product_Info_21 = transform_Product_Info_2(X_test['Product_Info_2'])
X_test = get_extracted_Product_Info_2(transform_Product_Info_21,original_df)
print("Extraction of new columns from Product_Info_2 for X_test Completed Successfully.!")

print("\n")
print("X_train shape after extracting columns from Product_Info_2: ",X_train.shape)
print("X_train shape after extracting columns from Product_Info_2: ",X_test.shape)
print("\n")

#--------------------------------------------------------------------------------------

from sklearn.preprocessing import OrdinalEncoder

# get the categories from ordinal columns
def get_Categories(df,column_name):
  """This method will find the list of all the categories in the column.
     -> Takes the data frame and column name 
     <- Returns the list of categories in the column named.
  """
  categories_list = df[column_name].unique().tolist()
  return categories_list


# encoding the ordinal categorical columns
def encoding_ordinal_column(train,test,list_of_categories,column_name):
  """This method will encode the Product_Info_2_alpha into numerical column using Ordinal Encoding .
     -> Takes train,test data frames, list of categores and column name
     <- Returns the train and test data frame encoded.
  """
  oe = OrdinalEncoder(categories=[list_of_categories])
  oe.fit(train[[column_name]])
  train[column_name] = oe.transform(train[[column_name]])
  test[column_name] = oe.transform(test[[column_name]])
  return train,test

print("\n")
print("Encoding Ordinal categorical column 'Product_Info_2_alpha' to numerical column using Ordinal Encoding.! ")

print("Getting the categories.!")
categories_list = get_Categories(X_train,'Product_Info_2_alpha')
print("Categories for the column are: ",categories_list)
print("Encoding Started.!")
X_train,X_test = encoding_ordinal_column(X_train,X_test,categories_list,'Product_Info_2_alpha')
print("Encoding Completed Successfully.!")
print("\n")

#-----------------------------------------------------------------------------------------------------------------

from sklearn.preprocessing import StandardScaler
# scaling the data
def scale_data(train,test):
  """This method Standardzies the data using StandardScaler
     -> Takes trainand test data frames 
     <- Returns train_scaled and test_scaled data frames.
  """
  sc = StandardScaler()
  sc.fit(train)
  X_train_scaled = sc.transform(train)
  X_test_scaled = sc.transform(test)
  X_train_scaled = pd.DataFrame(data=X_train_scaled,columns=train.columns,index = train.index)
  X_test_scaled = pd.DataFrame(data=X_test_scaled,columns=test.columns,index = test.index)
  return X_train_scaled,X_test_scaled

print("Standardizing Data")
print("Converting the data into Scaled values Started.!")
X_train_scaled,X_test_scaled =  scale_data(X_train,X_test)
print("Converting the data into Scaled values Completed Successfully.!")
print("******************************************************************")
print("We completed preprocessing the Data.")
print("Finally our Data is ready for Training.!!")
print("Export the Data to csv so that we can now train our Model.!")

#------------------------------------------------------------------------------------------------

def saving_csv_file(X_train,X_test,y_train,y_test):
  """This method will save all the processed file for Training purpose.
  """ 
  os.makedirs(r'C:\Users\Alkashi\Desktop\GL Capstone Project\Project Folder\Data_For_Training/',exist_ok=True)
  X_train.to_csv(r'C:\Users\Alkashi\Desktop\GL Capstone Project\Project Folder\Data_For_Training/X_train.csv')
  X_train.to_csv(r'C:\Users\Alkashi\Desktop\GL Capstone Project\Project Folder\Data_For_Training/X_test.csv')
  y_train.to_csv(r'C:\Users\Alkashi\Desktop\GL Capstone Project\Project Folder\Data_For_Training/y_train.csv')
  y_test.to_csv(r'C:\Users\Alkashi\Desktop\GL Capstone Project\Project Folder\Data_For_Training/y_test.csv')

print("\n")
print("Saving the files for Training")
saving_csv_file(X_train,X_test,y_train,y_test)
print("Saving the files for Training Completed Successfully.!!!!!!!!!!")


Loading Data.!
Data succesfully loaded.!


Set ID as Index.!
ID set as index succesfull.!


Remove Duplicate rows started.!
Duplicate Rows present in data:  30
Data Shape With Duplicated:  (59381, 127)
Data Shape Without Duplicated:  (59351, 127)
No. Duplicated Rows Removed:  30
Duplicate Rows in data:  30
Duplicate rows removed Succesfully.!


Spliting the Data into Independent Variable(X),Dependent Variable y started.!
Data Splited in into Independent Variable(X),Dependent Variable y successfully.!


Train Test Split Started.!
Shape of X_train:  (41545, 126)
Shape of y_train:  (41545,)
Shape of X_test :  (17806, 126)
Shape of y_test :  (17806,)
Train Test Split Succesfull.!


Missing Value details in X_train: 
                      missing_values  missing_values_%
Medical_History_10            41172         99.102178
Medical_History_32            40747         98.079191
Medical_History_24            38926         93.695992
Medical_History_15            31196         75.089662
Family_