In [2]:
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# compact, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [32]:
import os
import pickle
import random
import re
import string
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Example of connecting to the database

In [4]:
# Connection settings are taken from the standard libpq environment variables
# so that real credentials never have to be typed into the notebook. When a
# variable is not set, the original empty placeholder value is used.
database = os.environ.get('PGDATABASE', '')
user = os.environ.get('PGUSER', '')
password = os.environ.get('PGPASSWORD', '')
host = os.environ.get('PGHOST', '')
# `or 0` also covers a variable that is set but empty.
port = int(os.environ.get('PGPORT') or 0)

In [5]:
# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be parsed as URL delimiters.
CONNECTION = (
    f"postgresql://{quote_plus(user)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)

# Reading data

In [7]:
# Load the complete users table.
user_data = pd.read_sql(sql=""" SELECT * FROM public.user_data;""", con=CONNECTION)
user_data

In [8]:
# Load the complete post-text table and preview its first ten rows.
post_data = pd.read_sql(sql=""" SELECT * FROM public.post_text_df;""", con=CONNECTION)
post_data.head(10)

In [9]:
# The feed table holds 76,892,800 records; only 2,000,000 of them are loaded.
# NOTE(review): LIMIT without ORDER BY does not pin down which rows are
# returned — confirm that an arbitrary subset is acceptable here.
whole_data = pd.read_sql(
    sql=""" SELECT * 
        FROM public.feed_data as fa 
        LIMIT 2000000
        """,
    con=CONNECTION,
)

whole_data

# Data processing

## post_data

In [10]:
# Shared WordNet lemmatizer instance; it is the default `token` argument of
# preprocessing() below.
lematizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing `word`.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased first
    letter of that tag selects adjective (J), noun (N), verb (V) or adverb (R);
    every other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()

    pos_lookup = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if first_letter in pos_lookup:
        return pos_lookup[first_letter]
    return wordnet.NOUN

def del_letter_next_number(line):
    """Remove every space-delimited token that contains an ASCII digit.

    A token is a maximal run of characters between single space characters
    (``' '``); other whitespace such as tabs does not separate tokens. A token
    holding at least one of ``0-9`` is removed entirely, letters included,
    while the spaces around it are kept, so ``"abc 12x def"`` becomes
    ``"abc  def"``.

    Parameters
    ----------
    line : str
        Text to clean.

    Returns
    -------
    str
        `line` with all digit-bearing tokens removed.
    """
    ascii_digits = set("0123456789")
    # split(' ') keeps the empty strings produced by consecutive spaces, so
    # joining with ' ' restores the original spacing exactly.
    kept_tokens = [
        "" if any(char in ascii_digits for char in token) else token
        for token in line.split(" ")
    ]
    return " ".join(kept_tokens)

def preprocessing(line, token=lematizer):
    """Normalise one document before TF-IDF vectorisation.

    Steps, in order: lower-case the text, turn line breaks into spaces, drop
    tokens containing digits (``del_letter_next_number``), replace punctuation
    with spaces, then tokenize and lemmatize every word using the
    part-of-speech returned by ``get_wordnet_pos``.

    Parameters
    ----------
    line : str
        Raw document text.
    token : object with a ``lemmatize(word, pos)`` method
        Lemmatizer to apply; defaults to the module-level WordNet lemmatizer.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.
    """
    line = line.lower()
    # A blank line ('\n\n') collapses to one space rather than two.
    line = line.replace('\n\n', ' ').replace('\n', ' ')
    line = del_letter_next_number(line)
    # string.punctuation must be escaped before it is placed in a character
    # class: unescaped, its `\]` pair is read as an escaped ']' and backslashes
    # are never matched.
    line = re.sub(r"[{}]".format(re.escape(string.punctuation)), " ", line)
    line = ' '.join([token.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(line)])
    return line

In [12]:
def tf_idf():
    """Fit TF-IDF on ``post_data['text']`` and return the weights as a DataFrame.

    The vectorizer uses ``preprocessing`` as its preprocessor and the built-in
    English stop-word list. The result is dense, has one row per post (indexed
    by ``post_data.post_id``) and one column per vocabulary term.
    """
    vectorizer = TfidfVectorizer(preprocessor=preprocessing, stop_words='english')
    weights = vectorizer.fit_transform(post_data['text'])

    return pd.DataFrame(
        weights.toarray(),
        index=post_data.post_id,
        columns=vectorizer.get_feature_names_out(),
    )

In [13]:
tfidf_transform = tf_idf()

# Per-post summary statistics of the TF-IDF weights. tf_idf() builds its rows
# from post_data in order, so the values are assigned positionally; this does
# not depend on post_data carrying a default 0..n-1 index.
post_data['TotalTfIdf'] = tfidf_transform.sum(axis=1).to_numpy()
post_data['MaxTfIdf'] = tfidf_transform.max(axis=1).to_numpy()
post_data['MeanTfIdf'] = tfidf_transform.mean(axis=1).to_numpy()

post_data.head()

In [15]:
def post_clustering(n_components, n_cluster, random_state=0):
    """Cluster posts in PCA-reduced TF-IDF space.

    The TF-IDF matrix (``tfidf_transform``) is centred, projected onto
    `n_components` principal components and clustered with KMeans into
    `n_cluster` groups.

    Side effect: the cluster label of every post is written to
    ``post_data['TextCluster']``.

    Parameters
    ----------
    n_components : int
        Number of principal components to keep.
    n_cluster : int
        Number of KMeans clusters.
    random_state : int, default 0
        Seed used for both PCA and KMeans so that repeated runs give the same
        clusters. (KMeans was already seeded with 0; PCA was previously
        unseeded.)

    Returns
    -------
    pandas.DataFrame
        One row per post and one ``DistanceTo{i}thCluster`` column per cluster
        (numbered from 1), holding the output of ``kmeans.transform``.
    """
    centered = tfidf_transform - tfidf_transform.mean()

    # Seed PCA as well: per the scikit-learn docs its randomized solver is
    # otherwise non-deterministic, which would make the labels unreproducible.
    pca = PCA(n_components=n_components, random_state=random_state)
    pca_transform = pca.fit_transform(centered)

    kmeans = KMeans(n_clusters=n_cluster, random_state=random_state).fit(pca_transform)
    post_data['TextCluster'] = kmeans.labels_

    dists_cols = [f"DistanceTo{ith}thCluster" for ith in range(1, n_cluster+1)]

    return pd.DataFrame(
        data=kmeans.transform(pca_transform),
        columns=dists_cols
    )

In [16]:
# Append the cluster-distance columns (20 principal components, 12 clusters)
# to the post features.
post_data = pd.concat(
    [post_data, post_clustering(n_components=20, n_cluster=12)],
    axis='columns',
)

post_data.head()

## whole_data

In [17]:
"""delete data where an action is like because it is doubling"""
# Keep only the rows whose action is not 'like'.
whole_data = whole_data.loc[whole_data['action'].ne('like')]

In [18]:
# Class balance of the target after the 'like' rows were removed.
whole_data['target'].value_counts()

# Merge everything into whole_data

In [19]:
# Attach post features (by post_id) and user features (by user_id) to every
# feed record; left joins keep all feed rows.
whole_data = (
    whole_data
    .merge(post_data, on='post_id', how='left')
    .merge(user_data, on='user_id', how='left')
)

whole_data.head()

In [20]:
# Calendar features derived from the event timestamp. The vectorised `.dt`
# accessors replace a per-row Python lambda; weekday is Monday=0 as before.
whole_data['hour'] = whole_data['timestamp'].dt.hour
whole_data['dayofweek'] = whole_data['timestamp'].dt.weekday
whole_data['month'] = whole_data['timestamp'].dt.month

# The raw action label and the post text are not used as model features.
whole_data = whole_data.drop(['action','text'], axis=1)

whole_data = whole_data.set_index(['user_id', 'post_id'])

whole_data.head()

# Train-test split

In [21]:
# Earliest and latest timestamp in the sample (vectorised Series reductions
# instead of Python's built-in min/max over every element).
whole_data.timestamp.min(), whole_data.timestamp.max()

In [22]:
# Chronological hold-out: records before SPLIT_DATE are used for training, the
# remaining ones for evaluation.
# The intermediate frames are called `train_df` / `test_df` so that they do not
# share a name with the `train()` helper defined later in this notebook;
# re-running this cell therefore no longer overwrites that function.
SPLIT_DATE = '2021-12-16'
train_df = whole_data[whole_data.timestamp < SPLIT_DATE]
test_df = whole_data[whole_data.timestamp >= SPLIT_DATE]

# The timestamp itself is not a model feature.
whole_data = whole_data.drop('timestamp', axis=1)
train_df = train_df.drop('timestamp', axis=1)
test_df = test_df.drop('timestamp', axis=1)

X_train = train_df.drop('target', axis=1)
X_test = test_df.drop('target', axis=1)

y_train = train_df['target']
y_test = test_df['target']

# Implementation of the ColumnTransformer

In [23]:
def data_processing(whole_data, categorical_cols, feature_columns=None, max_one_hot_cardinality=5):
    """Build the (unfitted) ColumnTransformer that encodes categorical features.

    Columns with at most `max_one_hot_cardinality` distinct values in
    `whole_data` are routed to a OneHotEncoder, all other categorical columns
    to a TargetEncoder. Columns are referenced by their integer position in
    the feature matrix.

    Parameters
    ----------
    whole_data : pandas.DataFrame
        Frame used to count the distinct values of each categorical column.
    categorical_cols : list of str
        Names of the categorical columns.
    feature_columns : iterable of str, optional
        Column order of the feature matrix the transformer will be fitted on.
        Defaults to ``X_train.columns`` (the notebook-level training frame).
    max_one_hot_cardinality : int, default 5
        Largest number of distinct values that is still one-hot encoded.

    Returns
    -------
    sklearn.compose.ColumnTransformer

    Raises
    ------
    ValueError
        If a categorical column is missing from `feature_columns`.

    Notes
    -----
    NOTE(review): no ``remainder`` is passed, so columns outside
    `categorical_cols` get ColumnTransformer's default treatment — per the
    scikit-learn docs they are dropped; confirm that this is intended.
    """
    if feature_columns is None:
        feature_columns = X_train.columns
    feature_columns = list(feature_columns)

    # Count distinct values once per column instead of once per encoder list.
    cardinality = {col: whole_data[col].nunique() for col in categorical_cols}
    one_hot_cols = [col for col in categorical_cols if cardinality[col] <= max_one_hot_cardinality]
    target_cols = [col for col in categorical_cols if cardinality[col] > max_one_hot_cardinality]

    one_hot_cols_inds = [feature_columns.index(col) for col in one_hot_cols]
    target_cols_inds = [feature_columns.index(col) for col in target_cols]

    t = [
        ("OneHotEncoder", OneHotEncoder(), one_hot_cols_inds),
        ("TargetEncoder", TargetEncoder(), target_cols_inds)
    ]

    return ColumnTransformer(transformers=t)

In [24]:
# Features treated as categorical by both models.
categorical_cols = [
    'gender', 'TextCluster', 'country', 'city', 'exp_group', 'os',
    'source', 'topic', 'hour', 'dayofweek', 'month',
]
transformer = data_processing(whole_data=whole_data, categorical_cols=categorical_cols)

# Training

In [25]:
def train(name, model, param_grid, categorical_cols=None):
    """Grid-search `model` on the training split and report its ROC-AUC.

    Hyper-parameters from `param_grid` are compared with GridSearchCV using
    ROC-AUC scoring on ``X_train`` / ``y_train``. The refitted best estimator
    is then scored on the train and test splits and both values are printed
    under a ``---name---`` header.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    model : estimator
        Estimator (or pipeline) to tune.
    param_grid : dict
        Grid passed to GridSearchCV.
    categorical_cols : list of str, optional
        When given (and non-empty), forwarded to ``fit`` as ``cat_features``.

    Returns
    -------
    The best estimator found by the search.
    """
    fit_kwargs = {"cat_features": categorical_cols} if categorical_cols else {}

    search = GridSearchCV(model, param_grid, scoring='roc_auc')
    search.fit(X_train, y_train, **fit_kwargs)
    best_model = search.best_estimator_

    # Probability of the positive class for every row of each split.
    train_scores = best_model.predict_proba(X_train)[:, 1]
    test_scores = best_model.predict_proba(X_test)[:, 1]

    print(f"---{name}---")
    train_auc = roc_auc_score(y_train, train_scores)
    print(f"roc-auc score on train data: {train_auc}")
    test_auc = roc_auc_score(y_test, test_scores)
    print(f"roc-auc score on test data: {test_auc}")
    return best_model

### Models

In [26]:
# Control model: encoded categorical features fed to a class-balanced logistic regression.
logreg = Pipeline([
    ("ColumnTransformer", transformer),
    ("LR", LogisticRegression(random_state=21, class_weight='balanced')),
])
param_grid_1 = {
    "LR__C": [0.01, 0.1, 1, 5, 10],
}

# Test model: CatBoost, which receives the categorical column names at fit time.
catboost = CatBoostClassifier(random_state=21, verbose=0)

param_grid_2 = {
    'iterations': [100, 150],
    'depth': [2, 4],
    'l2_leaf_reg': [5, 10],
}

# Parallel lists: entry k of each list describes model k.
param_grid = [param_grid_1, param_grid_2]
models = [logreg, catboost]
name = ['LogisticRegression', 'CatBoostClassifier']
# `cat_features` forwarded to fit(): None for the pipeline (it encodes the
# columns itself), the column names for CatBoost.
fit_cat_features = [None, categorical_cols]
best_models = []

# Iterate over the lists together instead of hard-coding indices 0 and 1, so
# an additional model only needs one new entry in each list.
for i, (model_name, estimator, grid, cat_features) in enumerate(
    zip(name, models, param_grid, fit_cat_features)
):
    best_model = train(model_name, estimator, grid, cat_features)
    if i < len(models) - 1:
        # Blank line between consecutive reports.
        print()
    best_models.append(best_model)

# Save two models for A/B testing

In [30]:
# Persist the tuned models: best_models[0] as the control variant and
# best_models[1] as the test variant. The `with` blocks make sure each file is
# flushed and closed once it has been written.
with open('model_control_1.pkl', 'wb') as control_file:
    pickle.dump(best_models[0], control_file)
with open('model_test_1.pkl', 'wb') as test_file:
    pickle.dump(best_models[1], test_file)

# Upload post_data to the database

In [32]:
# Write the engineered post features to public.post_processed_features,
# replacing the table if it already exists; the DataFrame index is not stored.
post_data.to_sql(
    name="post_processed_features",
    con=CONNECTION,
    schema="public",
    if_exists='replace',
    index=False,
)

In [33]:
# Read the uploaded table back to check the result of the write.
test_ = pd.read_sql(sql="""SELECT * FROM public.post_processed_features""", con=CONNECTION)

test_