In [None]:
# module import
import pandas as pd
import pdpipe as pdp
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# label encoding dictionary
dict_label_encoding = {'Yes': 1, 'No': 0}

# will save DataFrames
df_list = []

for df_str in ['social_media_train.csv',
               'social_media_test.csv',
               'social_media_aim.csv']:

    # data read in (the first CSV column becomes the index)
    df = pd.read_csv(df_str, index_col=[0])

    # label encoding of the binary Yes/No columns
    # Assign the whole column with df[col] = ... instead of df.loc[:, col] = ...:
    # since pandas 2.0 the .loc form writes into the existing object-dtype column,
    # which leaves the 0/1 values stored as Python objects rather than integers.
    for col in ['profile_pic', 'extern_url', 'private']:
        # map() builds the encoded column directly from the dictionary;
        # any value other than 'Yes'/'No' becomes NaN.
        df[col] = df[col].map(dict_label_encoding)

    # append to list
    df_list.append(df)


# creating data sets (same order as the file names above)
df_train, df_test, df_aim = df_list

# one-hot encoding of the categorical column "sim_name_username"
# drop_first=False keeps one dummy column per category.
# NOTE(review): the earlier remark said this avoids collinearity; keeping every
# dummy column does not do that (dropping the first level would) - confirm that
# False is really intended here.
onehot = pdp.OneHotEncode(["sim_name_username"], drop_first=False)

# the encoder is fitted on the training set only ...
df_train = onehot.fit_transform(df_train)

# ... and then merely applied to the test and aim sets
df_test, df_aim = (onehot.transform(frame) for frame in (df_test, df_aim))

# look at data
df_train.head()

In [None]:
# Solution:
# positional split of the training frame: column 0 is used as the target,
# every remaining column as a feature
target_train = df_train.iloc[:, 0]
features_train = df_train.iloc[:, 1:]

In [None]:
# Solution:
# Two-step pipeline: standardise the features, then fit a logistic regression.
# The step names ('scaler', 'classifier') are what hyper-parameter grids must
# use as prefixes, e.g. 'classifier__C'.
pipeline_log = Pipeline([('scaler', StandardScaler()),
                         ('classifier', LogisticRegression(solver='saga',
                                                           # must be an int: scikit-learn's parameter
                                                           # validation rejects the float 1e4
                                                           max_iter=10000,
                                                           random_state=42))])

In [None]:
# Solution:
import numpy as np

# show floats in plain decimal notation instead of scientific notation
np.set_printoptions(suppress=True)

# 14 candidate values for C, evenly spaced on a log scale from 0.001 to 1000
C_values = np.geomspace(1e-3, 1e3, num=14)

print(C_values)

In [None]:
# Solution:
# hyper-parameter grid for the 'classifier' pipeline step:
# both penalty types combined with every candidate value of C
search_space_grid = [
    {
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': C_values,
    },
]

In [None]:
# Solution:
# grid search over search_space_grid for pipeline_log:
# 5-fold cross-validation, scored by ROC AUC, using all available CPU cores
model_grid = GridSearchCV(
    pipeline_log,
    search_space_grid,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
)

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning

# from here on, do not show scikit-learn's DataConversionWarning
warnings.filterwarnings('ignore', category=DataConversionWarning)

In [None]:
# Solution:
# run the cross-validated grid search on the training data
model_grid.fit(X=features_train, y=target_train)

# print the best pipeline found, followed by its cross-validation score
for result in (model_grid.best_estimator_, model_grid.best_score_):
    print(result)

**2. Modellevaluation mit Testdaten**

In [None]:
# Solution:

# positional split of the test frame: column 0 is the target, the rest are features
target_test = df_test.iloc[:, 0]
features_test = df_test.iloc[:, 1:]

# predicted class probabilities from the fitted grid search
target_test_pred_proba = model_grid.predict_proba(features_test)

# ROC AUC on the test set, scored with the second probability column
roc_auc_score(y_true=target_test, y_score=target_test_pred_proba[:, 1])

**3. Vorhersage der Aimdaten**

In [None]:
# Solution:
# Build the feature frame for the aim data. The two prediction columns written
# below are dropped first (if present), so re-running this cell still hands the
# model exactly the original feature columns. drop() returns a new frame, so
# features_aim is not affected by the columns added to df_aim afterwards.
features_aim = df_aim.drop(columns=['fake_pred_proba', 'fake_pred'], errors='ignore')

# second probability column and the predicted class label for every aim row
df_aim.loc[:, 'fake_pred_proba'] = model_grid.predict_proba(features_aim)[:, 1]
df_aim.loc[:, 'fake_pred'] = model_grid.predict(features_aim)

# avoid scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)
df_aim