In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
import warnings
import scipy.stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, accuracy_score
from sklearn.metrics import precision_score

warnings.filterwarnings('ignore')

#dropping columns with missing value more than 50% or one value or all different value

microsoft_data =pd.read_csv('Microsoft_Data.csv')
missing_value_df=pd.DataFrame({'Columns Name':microsoft_data.columns,
                  'No. of missing values': microsoft_data.isnull().sum(),
                  '% of missing values': (microsoft_data.isnull().sum()/microsoft_data.shape[0])*100})
missing_value_df.sort_values(by='% of missing values', ascending=False)

for col in microsoft_data.columns:
    if ((microsoft_data[col].isnull().sum()/microsoft_data.shape[0])*100) > 50 or len(microsoft_data[col].unique())==1 or len(microsoft_data[col].unique())==microsoft_data.shape[0]:
        microsoft_data.drop(columns=col, inplace=True)  
        
Y=microsoft_data['HasDetections']
microsoft_data.drop(columns='HasDetections', inplace=True)


In [9]:
# changing columns dtype to object

for col in microsoft_data.columns:
    if microsoft_data[col].dtypes=='float64' or microsoft_data[col].dtypes=='int64':
        microsoft_data[col]=microsoft_data[col].astype('object')
        
        
#filling nan values with mode and converting it datatype back to object

for col in microsoft_data.columns:
    microsoft_data[col]=microsoft_data[col].fillna(microsoft_data[col].mode()[0])
    
for col in microsoft_data.columns:
    if microsoft_data[col].dtypes=='float64' or microsoft_data[col].dtypes=='int64':
        microsoft_data[col]=microsoft_data[col].astype('object')
        
# for remomving unecessary columns

for col in microsoft_data.columns:
    if len(microsoft_data[col].unique())==1 or len(microsoft_data[col].unique())==microsoft_data.shape[0]:
        microsoft_data.drop(columns=col, inplace=True)

In [10]:
# testing relation using CHI Square


p_value=[]
column_name=[]

for col in microsoft_data.columns:
    crosstab = pd.crosstab(microsoft_data[col],Y)
    #print(crosstab)
    result=scipy.stats.chi2_contingency(crosstab)
    p_value.append(result[1])
    column_name.append(col)
    
p_value_df=pd.DataFrame({'ColumnNames': column_name,
                        'P-Value': p_value})


In [11]:
# dropping columns whose p values is greater than .05

for i in range(0, len(p_value_df)):
    if p_value_df['P-Value'][i] < 0.05:
        p_value_df.drop(index=i, inplace=True)
        
for col in p_value_df.ColumnNames:
    microsoft_data.drop(columns=col, inplace=True)
    

In [18]:
# train test spliiting

X_train,X_test,Y_train,Y_test=train_test_split(microsoft_data,Y,test_size=0.20,random_state=42)

In [14]:
class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)


In [15]:
# label encoding

        
le=LabelEncoderExt()

for col in microsoft_data.columns:
    le.fit(X_train[col]) 
    X_train[col]=le.transform(X_train[col])
    X_test[col]=le.transform(X_test[col])

In [16]:
#GRID SEARCH CV withmodel as DEcisionTree

decision_tree= DecisionTreeClassifier(criterion='entropy')
param_dict={'max_depth':[3,4,5],
           'min_samples_split':[2,3,4,5,6],
           'min_samples_leaf':[3,6,7,8,9]}
grid_search=GridSearchCV(decision_tree,param_grid=param_dict,cv=5)
grid_search.fit(X_train,Y_train)
grid_pred=grid_search.predict(X_test)
#dt1=DecisionTreeClassifier(random_state=42,max_depth=4,min_samples_leaf=7,min_samples_split=2)
#dt1.fit(X_train,Y_train)
#best_tree_pred=dt1.predict(X_test)

print(accuracy_score(Y_test,grid_pred))
#accuracy_score(Y_test,best_tree_pred)

0.5818357998590556


In [17]:
#GRid Search CV with model as Random Forest
random_forest=RandomForestClassifier(criterion='gini',oob_score=False,random_state=42)
param_dict_random={'max_depth':[3,4,5],
           'min_samples_split':[2,3,4,5,6],
           'min_samples_leaf':[3,6,7,8,9],
                  'n_estimators':[10,20,30,40]}
grid_search_random=GridSearchCV(random_forest,param_grid=param_dict_random,cv=5)
grid_search_random.fit(X_train,Y_train)
grid_pred_random=grid_search_random.predict(X_test)
accuracy_score(Y_test,grid_pred_random)

0.5880902043692742