In [126]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [127]:
df = pd.read_csv("course_lead_scoring.csv")

In [128]:
len(df)

1462

In [129]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [130]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

Checking for missing values

In [131]:
df.nunique()
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [132]:
df.columns = df.columns.str.replace(" ", "_")
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [133]:
string_columns = list(df.dtypes[df.dtypes == "object"].index)
string_columns

['lead_source', 'industry', 'employment_status', 'location']

In [134]:
num_columns = list(df.dtypes[df.dtypes == "int64"].index) + list(df.dtypes[df.dtypes == "float64"].index)
num_columns

['number_of_courses_viewed',
 'interaction_count',
 'converted',
 'annual_income',
 'lead_score']

In [135]:
for col in string_columns:
    df[col] = df[col].fillna("NA")

for col in num_columns:
    df[col] = df[col].fillna(0.0)

In [136]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [137]:
# The most frequent observation for Industry
df.industry.mode()

0    retail
Name: industry, dtype: object

Question 2: Correlation Matrix

In [138]:
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [139]:
categorical = [
    "lead_source", "industry", "employment_status", "location"
]

numerical = [
    "number_of_courses_viewed", "annual_income", "interaction_count", "lead_score"
]

In [140]:
# Setting up validation framework
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)


In [141]:
# We reset the index and extract the y values

df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [142]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

In [143]:
del df_train["converted"]
del df_val["converted"]
del df_test["converted"]

In [144]:
df_full_train.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [145]:
# df_full_train[categorical]
# df_full_train[categorical].nunique()

df_full_train[numerical]
# df_full_train[numerical].nunique()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
0,2,44403.0,1,0.71
1,3,38048.0,6,0.97
2,2,71399.0,1,0.51
3,2,47912.0,1,0.04
4,1,34806.0,4,0.32
...,...,...,...,...
1164,1,57039.0,2,0.30
1165,2,56185.0,2,0.44
1166,1,56402.0,1,0.02
1167,1,45688.0,3,0.02


In [146]:
# Correlation coefficient
df_full_train.converted

0       0
1       1
2       1
3       0
4       1
       ..
1164    0
1165    0
1166    0
1167    1
1168    0
Name: converted, Length: 1169, dtype: int64

In [147]:
# Correlation coefficient
# The two features with the biggest correlation
df_full_train[numerical].corrwith(df_full_train.converted)

number_of_courses_viewed    0.442068
annual_income               0.029612
interaction_count           0.378482
lead_score                  0.225641
dtype: float64

Question 3: Mutual Information

In [148]:
# We apply to all variables to know which of them is more important
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [149]:
# Biggest Mutual Information Score
score = df_full_train[categorical].apply(mutual_info_converted_score)
score.sort_values(ascending=False).round(2)

lead_source          0.03
employment_status    0.01
industry             0.01
location             0.00
dtype: float64

Question 4: Logistic regression

In [150]:
train_dicts = df_train[categorical + numerical].to_dict(orient="records")

In [151]:
train_dicts[2]

{'lead_source': 'paid_ads',
 'industry': 'technology',
 'employment_status': 'employed',
 'location': 'north_america',
 'number_of_courses_viewed': 3,
 'annual_income': 81973.0,
 'interaction_count': 2,
 'lead_score': 0.59}

In [152]:
dv = DictVectorizer(sparse=False)

In [153]:
X_train = dv.fit_transform(train_dicts)
X_train[2]

array([8.1973e+04, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 2.0000e+00,
       5.9000e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       3.0000e+00])

In [154]:
dv.get_feature_names_out()

array(['annual_income', 'employment_status=NA',
       'employment_status=employed', 'employment_status=self_employed',
       'employment_status=student', 'employment_status=unemployed',
       'industry=NA', 'industry=education', 'industry=finance',
       'industry=healthcare', 'industry=manufacturing', 'industry=other',
       'industry=retail', 'industry=technology', 'interaction_count',
       'lead_score', 'lead_source=NA', 'lead_source=events',
       'lead_source=organic_search', 'lead_source=paid_ads',
       'lead_source=referral', 'lead_source=social_media', 'location=NA',
       'location=africa', 'location=asia', 'location=australia',
       'location=europe', 'location=middle_east',
       'location=north_america', 'location=south_america',
       'number_of_courses_viewed'], dtype=object)

In [155]:
val_dicts = df_val[categorical + numerical].to_dict(orient="records")
val_dicts

[{'lead_source': 'paid_ads',
  'industry': 'healthcare',
  'employment_status': 'unemployed',
  'location': 'europe',
  'number_of_courses_viewed': 3,
  'annual_income': 52220.0,
  'interaction_count': 1,
  'lead_score': 0.07},
 {'lead_source': 'organic_search',
  'industry': 'technology',
  'employment_status': 'unemployed',
  'location': 'middle_east',
  'number_of_courses_viewed': 3,
  'annual_income': 59656.0,
  'interaction_count': 4,
  'lead_score': 0.65},
 {'lead_source': 'events',
  'industry': 'manufacturing',
  'employment_status': 'self_employed',
  'location': 'north_america',
  'number_of_courses_viewed': 0,
  'annual_income': 57134.0,
  'interaction_count': 4,
  'lead_score': 0.13},
 {'lead_source': 'events',
  'industry': 'other',
  'employment_status': 'NA',
  'location': 'asia',
  'number_of_courses_viewed': 0,
  'annual_income': 0.0,
  'interaction_count': 0,
  'lead_score': 0.03},
 {'lead_source': 'referral',
  'industry': 'retail',
  'employment_status': 'unemployed

In [156]:
X_val = dv.transform(val_dicts)
X_val

array([[5.2220e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.9656e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        3.0000e+00],
       [5.7134e+04, 0.0000e+00, 0.0000e+00, ..., 1.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [7.4166e+04, 0.0000e+00, 1.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00],
       [3.9103e+04, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        2.0000e+00],
       [4.7129e+04, 1.0000e+00, 0.0000e+00, ..., 0.0000e+00, 1.0000e+00,
        1.0000e+00]], shape=(293, 31))

In [157]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [158]:
model.coef_

array([[-1.77843869e-05, -1.47154423e-02,  3.39095225e-02,
         2.66248432e-03,  1.15238518e-02, -1.02527697e-01,
        -2.48510995e-02,  4.93604222e-02, -2.01258344e-02,
        -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
        -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,
         5.12012528e-02,  2.01511698e-02, -1.20346284e-02,
        -1.16021521e-02, -1.15251880e-01,  7.95303436e-02,
        -2.99401329e-02,  3.95843295e-03, -1.14296944e-02,
        -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,
         5.58598769e-03, -3.33967159e-02, -2.52837052e-02,
         4.53752887e-01]])

In [159]:
model.intercept_[0]

np.float64(-0.06914728027832363)

In [160]:
model.predict(X_train)
model.predict_proba(X_train)[:, 1]

array([0.57914343, 0.87283491, 0.58816107, 0.51231211, 0.63666475,
       0.75521068, 0.67660732, 0.80463891, 0.31931235, 0.52336705,
       0.42762898, 0.75635962, 0.36365335, 0.479759  , 0.66891574,
       0.79118097, 0.84805492, 0.75295599, 0.74242974, 0.44250074,
       0.59299541, 0.71532757, 0.89505926, 0.40505602, 0.41274906,
       0.49397089, 0.76167979, 0.54183228, 0.91299586, 0.84329958,
       0.43502339, 0.8413228 , 0.45134426, 0.73331191, 0.69094239,
       0.63501138, 0.78633979, 0.70340975, 0.31374645, 0.66994888,
       0.81195513, 0.54070748, 0.52703678, 0.66105373, 0.89692378,
       0.93952744, 0.69103809, 0.57569837, 0.43601078, 0.58839132,
       0.40836197, 0.62054107, 0.59360953, 0.80782455, 0.55881103,
       0.89447257, 0.85676682, 0.3290991 , 0.88454471, 0.61158866,
       0.55737895, 0.80158813, 0.29433961, 0.76846368, 0.26685382,
       0.61036624, 0.70840796, 0.76184472, 0.88772862, 0.60724473,
       0.74046877, 0.39672339, 0.94279447, 0.89334272, 0.77167

In [161]:
# Using validation datasets
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.61192163, 0.79982617, 0.53021344, 0.47131479, 0.57066131,
       0.44227168, 0.87127669, 0.84883115, 0.83290037, 0.61497801,
       0.54968026, 0.78153088, 0.69039785, 0.77017122, 0.5265944 ,
       0.91706425, 0.53170634, 0.42123048, 0.30146455, 0.84881583,
       0.79488653, 0.73670374, 0.44527211, 0.64838383, 0.4176882 ,
       0.75393418, 0.90166116, 0.33903048, 0.43181431, 0.9680681 ,
       0.92018714, 0.37487988, 0.652301  , 0.90650057, 0.75164116,
       0.64202121, 0.82250075, 0.83375553, 0.659116  , 0.30978853,
       0.78942264, 0.35546366, 0.96517758, 0.63389304, 0.51274195,
       0.53230533, 0.82287785, 0.744074  , 0.73452313, 0.68955217,
       0.46964443, 0.84539252, 0.55635243, 0.92637871, 0.65258021,
       0.61526273, 0.63816995, 0.28304018, 0.48049824, 0.57890618,
       0.35497342, 0.62175051, 0.38960778, 0.61156056, 0.85304278,
       0.75430136, 0.89185954, 0.71946459, 0.95387623, 0.89209517,
       0.75277087, 0.33850139, 0.61376593, 0.51622275, 0.64088

In [162]:
# Setting the threshold as 50%
conversion_decision = y_pred > 0.5
conversion_decision

array([ True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True, False,  True, False,  True,  True,
       False, False,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True, False, False,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
       False,  True,  True, False,  True,  True, False,  True,  True,
       False,  True,

In [163]:
# checking accuracy of our prediction
y_val

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 0])

In [164]:
conversion_decision.astype(int)

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0])

In [165]:
# To check the accuracy
accuracy = (y_val == conversion_decision).mean()
accuracy

np.float64(0.6996587030716723)

Question 5: Feature Elimination

In [166]:
# Excluding industry, employment status, and lead_score
excl_industry = ["lead_source", "employment_status", "location", "number_of_courses_viewed", "annual_income", "interaction_count", "lead_score"]
excl_empstatus = ["lead_source", "industry", "location", "number_of_courses_viewed", "annual_income", "interaction_count", "lead_score"]
excl_leadscore = ["lead_source", "industry", "employment_status", "location", "number_of_courses_viewed", "annual_income", "interaction_count"]

excl_industry, excl_empstatus, excl_leadscore

(['lead_source',
  'employment_status',
  'location',
  'number_of_courses_viewed',
  'annual_income',
  'interaction_count',
  'lead_score'],
 ['lead_source',
  'industry',
  'location',
  'number_of_courses_viewed',
  'annual_income',
  'interaction_count',
  'lead_score'],
 ['lead_source',
  'industry',
  'employment_status',
  'location',
  'number_of_courses_viewed',
  'annual_income',
  'interaction_count'])

In [167]:
train_industry_dict = df_train[excl_industry].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_industry_dict)

X_industry = dv.transform(train_industry_dict)

In [168]:
model_industry = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_industry.fit(X_industry, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [169]:
val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)
y_pred = model_industry.predict_proba(X_val)[:, 1]
conversion_decision = y_pred > 0.5
conversion_decision.astype(int)
industry_accuracy = (y_val == conversion_decision).mean()
industry_accuracy

np.float64(0.6996587030716723)

In [170]:
# Excluding employment status

train_employment_dict = df_train[excl_empstatus].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_employment_dict)

X_employment = dv.transform(train_employment_dict)

model_employment = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_employment.fit(X_employment, y_train)

val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)
y_pred = model_employment.predict_proba(X_val)[:, 1]
conversion_decision = y_pred > 0.5
conversion_decision.astype(int)
employment_accuracy = (y_val == conversion_decision).mean()
employment_accuracy

np.float64(0.6962457337883959)

In [171]:
# Excluding lead_score 

train_score_dict = df_train[excl_leadscore].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_score_dict)

X_score = dv.transform(train_score_dict)

model_score = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_score.fit(X_score, y_train)

val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)
y_pred = model_score.predict_proba(X_val)[:, 1]
conversion_decision = y_pred > 0.5
conversion_decision.astype(int)
score_accuracy = (y_val == conversion_decision).mean()
score_accuracy

np.float64(0.7064846416382252)

In [172]:
# We compute the differences
score_diff = score_accuracy - accuracy
employment_diff = employment_accuracy - accuracy
industry_diff = industry_accuracy - accuracy

print(score_diff, employment_diff, industry_diff)
industry_diff > employment_diff

0.0068259385665528916 -0.0034129692832763903 0.0


np.True_

Question 6: Regularization

In [173]:
train_dicts = df_train[categorical + numerical].to_dict(orient="records")
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)

C_values = [0.01, 0.1, 1, 10, 100]
for C in C_values:
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train, y_train)

    y_pred = model_reg.predict_proba(X_val)[:, 1]
    conversion_decision = (y_pred > 0.5).astype(int)

    accuracy = (y_val == conversion_decision).mean()
    print(f"C={C:<5} -> Validation Accuracy: {accuracy:.4f}")

C=0.01  -> Validation Accuracy: 0.6997
C=0.1   -> Validation Accuracy: 0.6997
C=1     -> Validation Accuracy: 0.6997
C=10    -> Validation Accuracy: 0.6997
C=100   -> Validation Accuracy: 0.6997


In [175]:
train_dicts = df_train[categorical + numerical].to_dict(orient="records")
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dicts)



model_reg = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model_reg.fit(X_train, y_train)

y_pred = model_reg.predict_proba(X_val)[:, 1]
conversion_decision = (y_pred > 0.5).astype(int)

accuracy = (y_val == conversion_decision).mean()
print(accuracy)

0.6996587030716723
