In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.metrics import confusion_matrix

In [2]:
%store -r clean_data

In [3]:
clean_data

Unnamed: 0,pseudo_id,mission_id,mission_difficulty,mission_stars_collected,day_auto_increment,lifetime_played_runs,max_run_distance,total_purchases_virtual,total_ads_watched,total_purchases_real,geo_country,days_played_in_month,virtual_currency_balance,target_max_day_played,mission_played,sum_ads,sum_purchases_real,sum_purchases_virtual
0,0,94,2,3,0,2,1763,2000,0,0,Honduras,0,5000,4,1,0,0,54500
1,0,109,1,4,0,3,2266,2000,0,0,Honduras,0,4238,4,2,0,0,54500
2,0,115,1,5,0,8,2266,3500,0,0,Honduras,0,3561,4,3,0,0,54500
3,0,114,1,6,2,9,2266,3500,0,0,Honduras,2,4284,4,4,0,0,54500
4,0,3,1,7,2,12,2266,7000,0,0,Honduras,2,1306,4,5,0,0,54500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2664754,90482,109,1,8,0,4,3743,0,2,0,United Kingdom,0,8078,2,6,15,0,1500
2664755,90482,11,1,8,0,4,3743,0,2,0,United Kingdom,0,8078,2,7,15,0,1500
2664756,90482,6,1,10,0,5,3743,0,3,0,United Kingdom,0,10336,2,8,15,0,1500
2664757,90482,19,2,10,0,5,3743,0,3,0,United Kingdom,0,10336,2,9,15,0,1500


In [4]:
clean_data['real_purchase'] = np.where(clean_data['sum_purchases_real'] > 0, True, False)

In [5]:
clean_data['real_purchase'].value_counts()

False    2612614
True       52145
Name: real_purchase, dtype: int64

In [6]:
X = clean_data.drop(['real_purchase', 'geo_country'],axis=1)
y = clean_data['real_purchase']

X_train, X_test, y_train, y_test = train_test_split(X,y)

numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

categoricals_train= X_train.select_dtypes(object)
categoricals_test= X_test.select_dtypes(object)

transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)
categoricals_train_encoded = encoder.transform(categoricals_train).toarray()
categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

LR = LinearRegression().fit(X_train,y_train)

score = LR.score(X_test, y_test)
score

0.15823365312384008

In [7]:
X = clean_data.drop(['real_purchase', 'total_purchases_real', 'pseudo_id', 'sum_purchases_real', 'geo_country'],axis=1)
y = clean_data['real_purchase']

X_train, X_test, y_train, y_test = train_test_split(X,y)

numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

categoricals_train= X_train.select_dtypes(object)
categoricals_test= X_test.select_dtypes(object)

transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)
categoricals_train_encoded = encoder.transform(categoricals_train).toarray()
categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)

LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)

LR.score(X_test, y_test)

0.9803539530764497

In [8]:
predictions = LR.predict(X_test)

In [9]:
predictions = pd.DataFrame(predictions, columns = ['pred'])

In [10]:
cm = confusion_matrix(y_test, predictions)
cm

array([[653102,      0],
       [ 13088,      0]], dtype=int64)

In [11]:
from imblearn.over_sampling import SMOTE

In [12]:
sm = SMOTE(random_state=100,k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [13]:
X_train_SMOTE.shape

(3919024, 14)

In [14]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.022166475856960063
recall:  0.5029798288508558
f1:  0.042461653572764685


In [15]:
cm = confusion_matrix(y_test, pred)
cm

array([[362705, 290397],
       [  6505,   6583]], dtype=int64)

In [16]:
from sklearn.feature_selection import RFE
rfe = RFE(LR, n_features_to_select = 1, verbose = False)
rfe.fit(X_train, y_train)

RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=1,
    verbose=False)

In [17]:
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = X.columns
df = df[df['Rank']==1]

In [18]:
df

Unnamed: 0,Rank,Column_name
5,1,max_run_distance


The variable that helps predict best the result is "max_run_distance"

In [19]:
from sklearn.feature_selection import RFE
rfe = RFE(LR, n_features_to_select = 10, verbose = False)
rfe.fit(X_train, y_train)

RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=10,
    verbose=False)

In [20]:
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = X.columns
# df = df[df['Rank']==1]
df

Unnamed: 0,Rank,Column_name
0,2,mission_id
1,1,mission_difficulty
2,1,mission_stars_collected
3,1,day_auto_increment
4,1,lifetime_played_runs
5,1,max_run_distance
6,1,total_purchases_virtual
7,4,total_ads_watched
8,5,days_played_in_month
9,1,virtual_currency_balance


In [21]:
best_cols = []
for i in df['Column_name']:
    best_cols.append(str(i))
best_cols

['mission_id',
 'mission_difficulty',
 'mission_stars_collected',
 'day_auto_increment',
 'lifetime_played_runs',
 'max_run_distance',
 'total_purchases_virtual',
 'total_ads_watched',
 'days_played_in_month',
 'virtual_currency_balance',
 'target_max_day_played',
 'mission_played',
 'sum_ads',
 'sum_purchases_virtual']

In [22]:
X = clean_data[(best_cols)]
y = clean_data['real_purchase']
X_train, X_test, y_train, y_test = train_test_split(X,y)

numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)

categoricals_train= X_train.select_dtypes(object)
categoricals_test= X_test.select_dtypes(object)

transformer = StandardScaler().fit(numericals_train)
numericals_train_standardized = transformer.transform(numericals_train)
numericals_test_standardized = transformer.transform(numericals_test)

encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categoricals_train)
categoricals_train_encoded = encoder.transform(categoricals_train).toarray()
categoricals_test_encoded = encoder.transform(categoricals_test).toarray()

X_train = np.concatenate((numericals_train_standardized,categoricals_train_encoded),axis=1)
X_test = np.concatenate((numericals_test_standardized,categoricals_test_encoded),axis=1)


In [23]:
sm = SMOTE(random_state=100,k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [24]:
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.02201746166809587
recall:  0.5098520279076899
f1:  0.04221203773057929
