In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import DataConversionWarning
# Silence scikit-learn's DataConversionWarning for the rest of the session.
warnings.filterwarnings('ignore', category=DataConversionWarning)

In [2]:
# Load the raw data set and split it into candidate features (X) and target (y).
# Forward slashes keep the relative path usable on Windows and POSIX alike;
# the previous backslash form only resolved on Windows.
input_df = pd.read_csv(filepath_or_buffer='./suv/User_Data.csv')
X = input_df[["User ID","Gender","Age","EstimatedSalary"]]
# Select the target as a 1-D Series: scikit-learn estimators expect y of shape
# (n_samples,), and a one-column DataFrame triggers DataConversionWarning.
y = input_df['Purchased']
# Preview the first rows; this must be the cell's last expression to be displayed.
input_df.head()

In [3]:
# One-hot encode the string-typed columns of X and put the encoded column
# in front of the remaining (numeric) columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns stored with object dtype are treated as categorical.
categorical_feature_mask = X.dtypes == object
categorical_masked_cols = X.columns[categorical_feature_mask].tolist()

ct = ColumnTransformer(
    [('encoder',
      # Fixed category order with drop='first' removes the 'Male' column, so
      # the single remaining indicator is 1 for 'Female' and 0 for 'Male'.
      OneHotEncoder(categories=[['Male', 'Female']], drop='first'),
      categorical_masked_cols)],
    remainder='passthrough',
    # Force dense output: with the default threshold the result becomes a
    # sparse matrix whenever the indicator density drops below 0.3, which
    # np.array(..., dtype=int) cannot convert.
    sparse_threshold=0,
)
# Built-in int replaces the np.int alias, which was removed in NumPy 1.24.
cat_data = np.array(ct.fit_transform(X[categorical_masked_cols]), dtype=int)
# Reuse X's index so the join aligns row-for-row even if X is not 0..n-1 indexed.
X = pd.DataFrame(data=cat_data, columns=['Encoded_Gender'], index=X.index).join(
    X.loc[:, X.columns != 'Gender'])

In [4]:
# Rank the candidate features by impurity-based importance from a tree ensemble.
from sklearn.ensemble import ExtraTreesClassifier
# The ensemble is randomized; a fixed seed keeps the ranking reproducible
# across Restart & Run All.
feature_selector = ExtraTreesClassifier(random_state=0)
feature_selector.fit(X, y)
# Pass the column Index directly: wrapping it in a list would build a
# one-level MultiIndex instead of plain feature-name labels.
pd.Series(data=feature_selector.feature_importances_, index=X.columns)

Encoded_Gender     0.014635
User ID            0.123441
Age                0.474119
EstimatedSalary    0.387805
dtype: float64

In [5]:
# Keep only the two columns with the highest importance in the ranking above.
X = X.loc[:, ['Age', 'EstimatedSalary']]

In [6]:
# Standardize the selected features to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): the scaler is fit on every row before the train/test split in
# the following cell, so test-set statistics influence the training features.
# Consider splitting first and fitting the scaler on the training rows only.
#
# columns=X.columns (not [X.columns]) keeps flat string column labels; the
# list-wrapped form produced MultiIndex columns. index=X.index preserves the
# original row labels.
X = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns, index=X.index)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [8]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with library-default hyperparameters,
# trained on the training split. fit() returns the estimator itself.
classifier = LogisticRegression().fit(X_train, y_train)
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# Score of the baseline classifier on the held-out test split.
classifier.score(X=X_test, y=y_test)

0.89

In [9]:
# Hyperparameter candidates for the grid search.
# Each dict is an independent sub-grid, so C, solver and max_iter are each
# varied on their own while every other setting keeps the estimator's value.
# Penalties are grouped with solvers that can fit them: 'l1' needs liblinear
# or saga, and 'elasticnet' needs saga plus an l1_ratio. Pairing them with the
# default lbfgs solver (as a bare penalty list would) makes those fits fail.
parameters = [
    {'C': [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]},
    {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    {'max_iter': [50, 100, 200, 300, 400, 500]},
    # NOTE(review): older scikit-learn releases spell "no penalty" as the
    # string 'none' rather than None; confirm against the installed version.
    {'penalty': ['l2', None]},
    {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [.25, .5, .75]},
]

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Exhaustive search over `parameters` starting from the baseline `classifier`,
# scored with 5-fold cross-validation; n_jobs=-1 runs the fits on all cores.
search = GridSearchCV(
    classifier,
    parameters,
    cv=5,
    n_jobs=-1,
)

In [12]:
# Run the grid search on the training split only; the test split stays unseen.
search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                0.8, 0.9, 1]},
                         {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                     'saga']},
                         {'max_iter': [50, 100, 200, 300, 400, 500]},
                         {'penal

In [14]:
# Display the estimator the grid search selected.
search.best_estimator_

LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
from sklearn.linear_model import LogisticRegression
# Rebuild the classifier from the hyperparameters the grid search selected,
# not from a hand-copied repr of its result. The copied keyword list goes
# stale as soon as the search changes, and it names constructor arguments that
# are not accepted by every scikit-learn version. Settings that were not
# searched keep the library defaults, as in the baseline `classifier`.
classifier = LogisticRegression(**search.best_params_)
classifier.fit(X_train, y_train)

LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Score of the tuned classifier on the held-out test split.
classifier.score(X=X_test, y=y_test)

0.9