In [28]:
# Relative paths of the pickled artefacts this notebook reads.
FEATURES_DICT = "../data/objects/features.pkl"  # unpickled into `features` further down

# Final train / test frames, read with pd.read_pickle further down.
FINAL_TRAIN, FINAL_TEST = (
    "../data/processed/train_final.pkl",
    "../data/processed/test_final.pkl",
)

In [26]:
# Load packages
import numpy as np
import pandas as pd

import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

___
## Functions:

In [30]:
def extract_features(tweet, freqs=None):
    """
    Turn one raw tweet into a (1, 3) feature row.
    Input:
        tweet: the raw tweet; it is handed to process_tweet and the result is
                iterated word by word
        freqs: mapping keyed by (word, label) tuples, where label is 1.0 or 0.0.
                When omitted, the notebook-level `features` object is looked up
                at call time.
    Output:
        A numpy array of shape (1, 3):
            column 0: the constant 1.0
            column 1: sum of freqs[(word, 1.0)] over the tweet's words
            column 2: sum of freqs[(word, 0.0)] over the tweet's words
        Words that are missing from freqs contribute 0.
    """
    # Resolve the default lazily. A `freqs=features` default is evaluated when
    # the `def` statement runs, which fails with NameError if `features` has not
    # been loaded yet and keeps pointing at a stale object if it is reloaded.
    if freqs is None:
        freqs = features

    # NOTE(review): process_tweet is neither defined nor imported in the cells
    # visible here - make sure it is in scope before calling this function.
    words = process_tweet(tweet)
    tweet_array = np.array([[1.0, 0.0, 0.0]])

    for word in words:
        tweet_array[0, 1] += freqs.get((word, 1.0), 0)
        tweet_array[0, 2] += freqs.get((word, 0.0), 0)

    return tweet_array

In [18]:
def calculate_quality(ground_truth, predictions, metric_function, model_name):
    """
    Score one set of predictions with a single metric.
    Input:
        ground_truth: the observed labels
        predictions: the values predicted by the model
        metric_function: callable invoked as metric_function(ground_truth, predictions)
        model_name: label used as the index of the returned Series
    Output:
        A one-element pandas Series whose index is model_name and whose value
        is the metric rounded to 3 decimals
    """
    rounded_score = round(metric_function(ground_truth, predictions), 3)
    return pd.Series({model_name: rounded_score})

In [23]:
def evaluate_model(model, data_list, metrics_list, model_name):
    """
    Score a model on several data splits with several metrics.
    Input:
        model: object exposing predict(X)
        data_list: the splits to evaluate, each given as [X, y, stage].
                typical input : [[X_train, y_train, 'train'], [X_test, y_test, 'test']]
        metrics_list: the metric callables to apply; each one's __name__ becomes a column.
                typical input : [accuracy_score, precision_score, recall_score, f1_score]
        model_name: string combined with the stage as "<model_name>_<stage>" to label each row.
    Output:
        scores: a dataframe with one row per split and one column per metric.
        predicts: a list with one dataframe of model.predict(X) output per split,
                in the order of data_list.
    """
    all_scores, all_predictions = [], []

    for split_inputs, split_targets, stage in data_list:
        # Predictions are wrapped in a DataFrame both for the metrics and for the caller.
        split_predictions = pd.DataFrame(model.predict(split_inputs))
        all_predictions.append(split_predictions)

        row_label = f"{model_name}_{stage}"
        per_metric = {}
        for metric in metrics_list:
            per_metric[metric.__name__] = calculate_quality(
                split_targets, split_predictions, metric, row_label
            )

        # Each value is a one-element Series; side by side they form one row.
        all_scores.append(pd.concat(per_metric, axis=1))

    return pd.concat(all_scores), all_predictions

In [None]:
def classify_tweet(tweet, clf):
    """
    Predict the label of a single raw tweet.
    Input:
        tweet: the raw tweet; it is passed to extract_features unprocessed,
                because extract_features runs process_tweet itself
        clf: a fitted estimator exposing predict(X), e.g. the pipeline built
                in the "Building model" section
    Output:
        The first element of clf.predict(...) for the tweet's feature row
    """
    tweet_features = extract_features(tweet)

    # An estimator fitted on a DataFrame exposes the column names it saw as
    # feature_names_in_; reuse them so predict receives matching column labels.
    column_names = getattr(clf, "feature_names_in_", None)
    if column_names is not None:
        tweet_features = pd.DataFrame(tweet_features, columns=list(column_names))

    return clf.predict(tweet_features)[0]

___
## Read data

In [29]:
# Load the (word, label) lookup that extract_features uses as its default `freqs`.
# NOTE: pickle.load can execute arbitrary code - only unpickle files produced by a trusted source.
with open(FEATURES_DICT, 'rb') as handle:
    features = pickle.load(handle)

In [6]:
# Read the final train / test frames; both are unpickled the same way.
train_df, test_df = (
    pd.read_pickle(pickle_path) for pickle_path in (FINAL_TRAIN, FINAL_TEST)
)

In [7]:
# Peek at five random rows; the fixed random_state keeps the displayed sample
# identical across Restart & Run All.
train_df.sample(5, random_state=42)

Unnamed: 0,bias,pos,neg,sentiment
340,1.0,4603.0,697.0,1.0
629,1.0,2954.0,94.0,1.0
7285,1.0,223.0,7616.0,0.0
4736,1.0,68.0,3781.0,0.0
547,1.0,4123.0,599.0,1.0


In [11]:
# Separate the label column from the model inputs for both splits.
y_train = train_df['sentiment']
y_test = test_df['sentiment']

X_train = train_df.drop(columns=['sentiment'])
X_test = test_df.drop(columns=['sentiment'])

___
## Building model:

In [12]:
# Chain a MinMaxScaler and a LogisticRegression into a single estimator;
# both steps are created with their default settings.
clf = make_pipeline(MinMaxScaler(), LogisticRegression())

In [14]:
# Fit the whole pipeline (scaler, then classifier) on the training split only.
clf.fit(X_train, y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('logisticregression', LogisticRegression())])

In [21]:
# Inputs for evaluate_model: the metric callables to apply and the
# [X, y, stage] splits to apply them to.
metrics_list = [accuracy_score, precision_score, recall_score, f1_score]
data_list = [
    [X_train, y_train, 'train'],
    [X_test, y_test, 'test'],
]

In [24]:
# Use evaluate_model function to run evaluations
# `scores` gets one row per split and one column per metric; `predicts` gets the
# per-split prediction frames in the same order as data_list.
# NOTE(review): in the saved output recall rounds to 1.0 on both splits while
# precision is about 0.89 - worth confirming this is expected before relying on it.
scores, predicts = evaluate_model(clf, data_list, metrics_list, "LogisticRegression")
scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression_train,0.943,0.897,1.0,0.946
LogisticRegression_test,0.941,0.894,1.0,0.944
