In [1]:
"""
import python packages
"""

from cubyc import Run
import numpy as np
import pandas as pd
import re
import os
import ast
from dotenv import load_dotenv
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [2]:
"""
get data
"""

load_dotenv()
words = ast.literal_eval(os.getenv('WORDS')) #import words from environment file

original_training_data = pd.read_csv('train.csv') #import train csv provided in data 100
train, val = train_test_split(original_training_data, test_size = 0.1, random_state = 100)
train.reset_index(inplace=True, drop=True)
val.reset_index(inplace=True, drop=True)

In [3]:
"""
feature functions

args:
    1. df - spam/ham dataframe
    2. additional feature specific arguments (e.g. words)
    
returns:
    1. pandas series/df of features
"""


def total_words(df): #number of each word in words in the email
    return pd.DataFrame(np.array([df['email'].str.count(word) for word in words]).T)


def length(df): #number of total words in the email
    return pd.Series(df['email'].apply(lambda x: len(re.split(r'\s|<', x))), name='length') 

features = [total_words, length]

In [4]:
"""
model creation
"""

def model_data(df):
    for feature in features:
        df = df.merge(feature(df), left_index=True, right_index=True)
    df.columns = df.columns.astype(str)
    return df.drop(columns=['id','subject','email','spam']), df['spam']

X_train, Y_train = model_data(train)
X_val, Y_val = model_data(val)

lr = LogisticRegression(fit_intercept=True, penalty='l2')
lr.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
"""
model evaluation
"""

train_predictions = lr.predict(X_train)
train_accuracy = np.mean(train_predictions == train["spam"])
print(train_accuracy)

val_predictions = lr.predict(X_val)
val_accuracy = np.mean(val_predictions == val["spam"])
print(val_accuracy)

0.7982164248635698
0.8395209580838323


In [6]:
"""
export to git (using cubyc)
"""

run = Run(remote="https://github.com/BoggieBoo/spam-ham", tags = [str(feature) for feature in features])
run.start()
run.log({"training accuracy": train_accuracy, "validation accuracy": val_accuracy, "words": words})
run.end()