In [1]:
from birdwatcher.config import DATA_PREP_CONFIG, PATHS, TRAINING_CONFIG
from birdwatcher.ml.evaluation import get_model_performance_metrics
from birdwatcher.ml.inference import get_trained_inference_pipeline
from birdwatcher.ml.training import _get_train_test_data, train_save_inference_pipeline

In [3]:
from birdwatcher.ml.inference import (
    _generate_inference_dataprep_kwargs,
    _generate_inference_tfidf_kwargs, 
    get_trained_inference_pipeline
)


# Assemble the end-to-end inference pipeline for the "covid" data key.
# Per the log output of this cell, the call loads the pickled PCA and model and
# chains them after the dataprep and feature-generation steps.
# NOTE(review): save=False presumably stops intermediate artifacts from being
# written during inference — confirm against get_trained_inference_pipeline.
inference_pipeline = get_trained_inference_pipeline(
    data_key="covid",
    save=False,
)

2023-06-11 00:51:04,850:INFO:inference: Getting dataprep and feature generation pipelines.
2023-06-11 00:51:04,852:INFO:inference: Loading trained pca from /home/ubuntu/twitter-sentiment-analysis/src/birdwatcher/ml/trained_pca.pkl.
2023-06-11 00:51:05,204:INFO:inference: Loading trained model from /home/ubuntu/twitter-sentiment-analysis/src/birdwatcher/ml/trained_model.pkl.
2023-06-11 00:51:05,205:INFO:inference: Combining dataprep, feature generation, pca and model into an inference pipeline.


In [6]:
# Display the pipeline's steps by name to check which components were assembled
# and with which keyword arguments.
inference_pipeline.named_steps

{'dataprep': Pipeline(steps=[('dataprep',
                  FunctionTransformer(func=<function process_text_df at 0x7f932491af70>,
                                      kw_args={'data_key': 'covid',
                                               'text_col': 'text'}))]),
 'feature_generation': Pipeline(steps=[('tfidf',
                  FunctionTransformer(func=<function generate_tfidf_df at 0x7f9324924430>,
                                      kw_args={'data_key': 'covid',
                                               'save': False,
                                               'text_col_proc': 'text_processed'}))]),
 'pca': PCAPlotIt(),
 'classifier': LogisticRegression(l1_ratio=0.05, n_jobs=-1, penalty='elasticnet',
                    random_state=123, solver='saga')}

In [4]:
import pandas as pd


from birdwatcher.dataprep.prep import _generate_raw_proc_path


# Resolve the location of the raw "covid" dataset from the data-prep config,
# then load it as a DataFrame.
covid_config = DATA_PREP_CONFIG.processing_info["covid"]
raw_path = _generate_raw_proc_path(config_dict=covid_config, df_type="raw")
covid_raw = pd.read_parquet(raw_path)

In [5]:
# Run the full inference pipeline on the raw covid tweets (dataprep, TF-IDF,
# PCA, classifier) and peek at the first ten predicted labels.
covid_preds = inference_pipeline.predict(covid_raw)
covid_preds[:10]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    0.4s finished
2023-06-11 00:51:15,121:INFO:prep: Raw text has been processed for data_key: 'covid'.
2023-06-11 00:51:15,122:INFO:feature_generation:Found vectorizer. Loading from /home/ubuntu/twitter-sentiment-analysis/src/birdwatcher/ml/tfidf_vectorizer.pkl.
2023-06-11 00:51:15,125:INFO:feature_generation:Transforming df_proc using fitted Tfidfvectorizer.


array([0, 1, 1, 0, 1, 1, 0, 1, 1, 1])

In [14]:
import pickle
from sklearn.pipeline import Pipeline



from birdwatcher.ml.inference import (
    _generate_inference_dataprep_kwargs,
    _generate_inference_tfidf_kwargs
)
from birdwatcher.dataprep.prep import get_dataprep_pipeline
from birdwatcher.ml.feature_generation import get_feature_generation_pipeline


# Settings shared by both kwargs builders below.
data_key = "covid"
save = False

# The run-date info object is unpickled from a project-configured path.
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only acceptable while PATHS.dc_run_date_info points at a trusted artifact.
with open(PATHS.dc_run_date_info, "rb") as run_info_file:
    run_info = pickle.load(run_info_file)
end_date_name = run_info.end_date_name

# Keyword arguments for the two preprocessing stages.
INFERENCE_DATAPREP_KWARGS = _generate_inference_dataprep_kwargs(
    data_key=data_key, end_date_name=end_date_name, save=save
)
INFERENCE_TFIDF_KWARGS = _generate_inference_tfidf_kwargs(data_key=data_key, save=save)

dataprep_pipeline = get_dataprep_pipeline(dataprep_kwargs=INFERENCE_DATAPREP_KWARGS)
feature_generation_pipeline = get_feature_generation_pipeline(
    tfidf_kwargs=INFERENCE_TFIDF_KWARGS
)

# Preprocessing-only pipeline (no PCA, no classifier) so the intermediate
# TF-IDF features can be inspected on their own.
preprocessing_steps = [
    ("dataprep", dataprep_pipeline),
    ("feature_generation", feature_generation_pipeline),
]
test_pipe = Pipeline(steps=preprocessing_steps)

In [15]:
# Push the raw covid tweets through dataprep and feature generation only, then
# preview the first five rows of the resulting TF-IDF frame.
covid_tfidf = test_pipe.transform(covid_raw)
covid_tfidf.head(n=5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    3.9s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    4.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    4.0s finished
2023-06-11 00:33:25,252:INFO:prep: Raw text has been processed for data_key: 'covid'.
2023-06-11 00:33:25,254:INFO:feature_generation:Found vectorizer. Loading from /home/ubuntu/twitter-sentiment-analysis/src/birdwatcher/ml/tfidf_vectorizer.pkl.
2023-06-11 00:33:25,258:INFO:feature_generation:Transforming df_proc using fitted Tfidfvectorizer.


Unnamed: 0,01,026,04,0430,07,08,0809,0815,09,10,...,zoya,zune,zunehd,zurich,zushi,zwolle,zzz,zzzzz,zzzzzzzzlullaby,zzzzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,01,026,04,0430,07,08,0809,0815,09,10,...,zoya,zune,zunehd,zurich,zushi,zwolle,zzz,zzzzz,zzzzzzzzlullaby,zzzzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Distinct values found in the "target" column of the covid TF-IDF frame.
# NOTE(review): the values shown are floats (0 and ~0.289), so this column looks
# like the TF-IDF weight of the token "target" rather than a sentiment label —
# confirm before treating it as ground truth.
covid_tfidf["target"].unique()

array([0.        , 0.28897039])

In [16]:
# Load the cached Sentiment140 TF-IDF features and preview the first rows.
# The location is resolved through the project's path helper (the same helper
# this notebook uses elsewhere to read these features) instead of a hard-coded,
# environment-specific S3 URI, so the cell keeps working if the bucket or cache
# layout changes.
# NOTE(review): assumes _get_tfidf_save_path(data_key="sentiment140") resolves to
# the cached Sentiment140_tfidf.parquet object that was previously hard-coded
# here — confirm.
from birdwatcher.ml.feature_generation import _get_tfidf_save_path

sentiment140_tfidf = pd.read_parquet(_get_tfidf_save_path(data_key="sentiment140"))
sentiment140_tfidf.head()

Unnamed: 0,target,01,026,04,0430,07,08,0809,0815,09,...,zoya,zune,zunehd,zurich,zushi,zwolle,zzz,zzzzz,zzzzzzzzlullaby,zzzzzzzzzzz
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Train the inference pipeline from scratch. The log output of this cell shows
# it loading the raw sentiment140 dataset and fitting the pre-training pipeline.
# NOTE(review): save=True presumably persists the trained artifacts and
# verbose=False presumably silences extra training output — confirm against
# train_save_inference_pipeline.
inference_pipeline = train_save_inference_pipeline(
    save=True,
    verbose=False,
)

2023-06-11 00:46:47,580:INFO:training: Defining pre-training pipeline.
2023-06-11 00:46:47,582:INFO:training: Loading raw sentiment140 dataset.
2023-06-11 00:46:48,101:INFO:training: Fitting pre-training pipeline and transforming raw dataset.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:   21.2s remaining:   21.2s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   21.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   21.2s finished
2023-06-11 00:47:09,353:INFO:prep: Raw text has been processed for data_key: 'sentiment140'.
2023-06-11 00:47:09,525:INFO:prep: Processed data has been saved for data_key: sentiment140.
2023-06-11 00:47:09,527:INFO:feature_generation:Found vectorizer. Loading from /home/ubuntu/twitter-sentiment-analysis/src/birdwatcher/ml/tfidf_vectorizer.pkl.
2023-06-11 00:47:09,531:INFO:feature_gen

In [4]:
# Project the train and test feature matrices with the pipeline's PCA step.
# x_train / x_test are not created in this cell; they come from whichever
# split cell was executed beforehand, so run that cell first.
pca = inference_pipeline.named_steps["pca"]
x_train_pcs, x_test_pcs = pca.transform(x_train), pca.transform(x_test)

In [5]:
# Inspect the PCA step's `fit_status` attribute.
# NOTE(review): presumably a flag saying whether the PCA has been fitted — confirm
# against the PCA step's implementation.
pca.fit_status

True

In [7]:
from birdwatcher.ml.evaluation import get_cv_performance_metrics


# Cross-validated metrics for the pipeline's classifier, computed on the
# PCA-projected training features (results are emitted through the logger).
cv_classifier = inference_pipeline.named_steps["classifier"]
get_cv_performance_metrics(clf=cv_classifier, x_train=x_train_pcs, y_train=y_train)

2023-06-10 21:56:17,708:INFO:evaluation:--------------------COMPUTING CROSS-VALIDATED PERFORMANCE METRICS--------------------

2023-06-10 21:56:32,312:INFO:evaluation:Cross-validated precision: 0.7188593023175232
2023-06-10 21:56:45,973:INFO:evaluation:Cross-validated recall: 0.7428873706869681
2023-06-10 21:56:59,793:INFO:evaluation:Cross-validated f1_macro: 0.7277395508596813
2023-06-10 21:57:13,665:INFO:evaluation:Cross-validated roc_auc: 0.8033560000293962


In [4]:
# Fitted coefficients of the pipeline's classifier step; the trailing zeros in
# the output are coefficients the fitted model left at exactly zero.
inference_pipeline.named_steps["classifier"].coef_

array([[-4.90596168, -0.84825256,  2.98537596, ...,  0.        ,
         0.        ,  0.        ]])

In [2]:
# Fetch the project's train/test split and unpack it into the four names the
# evaluation cells expect.
train_test = _get_train_test_data()
x_train, y_train = train_test.x_train, train_test.y_train
x_test, y_test = train_test.x_test, train_test.y_test

In [9]:
# Rebuild the inference pipeline using the function's default arguments.
# NOTE(review): this rebinds `inference_pipeline`, which other cells assign with
# different arguments or from training; which object is live depends on the
# order the cells were executed in.
inference_pipeline = get_trained_inference_pipeline()

In [3]:
# Pull the PCA and classifier steps out of the pipeline by step name.
pca = inference_pipeline["pca"]
trained_model = inference_pipeline["classifier"]

In [7]:
# Evaluate the pipeline's classifier on the PCA-projected features. Per the log
# output, this reports test-set metrics (accuracy, ROC AUC, confusion matrix,
# classification report) followed by cross-validated metrics.
fitted_classifier = inference_pipeline.named_steps["classifier"]
get_model_performance_metrics(
    clf=fitted_classifier,
    x_train=x_train_pcs,
    x_test=x_test_pcs,
    y_train=y_train,
    y_test=y_test,
)

2023-06-10 22:12:28,375:INFO:evaluation:--------------------COMPUTING TEST SET PERFORMANCE METRICS--------------------

2023-06-10 22:12:28,377:INFO:evaluation:Accuracy: 0.8070929607737776
2023-06-10 22:12:28,380:INFO:evaluation:ROC AUC: 0.8779248935614802
2023-06-10 22:12:28,382:INFO:evaluation:Confusion matrix:
[[763 176]
 [183 739]]
2023-06-10 22:12:28,390:INFO:evaluation:
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       939
           1       0.81      0.80      0.80       922

    accuracy                           0.81      1861
   macro avg       0.81      0.81      0.81      1861
weighted avg       0.81      0.81      0.81      1861

2023-06-10 22:12:28,391:INFO:evaluation:--------------------COMPUTING CROSS-VALIDATED PERFORMANCE METRICS--------------------

2023-06-10 22:12:43,339:INFO:evaluation:Cross-validated precision: 0.7142258106705889
2023-06-10 22:12:57,588:INFO:evaluation:Cross-validated recall: 0.735701021867749

In [3]:
import pandas as pd


# Load the raw Sentiment140 dataset from the path recorded in the data-prep
# config.
sentiment140_raw_path = DATA_PREP_CONFIG.processing_info["sentiment140"]["path_raw"]
df_sentiment_raw = pd.read_parquet(sentiment140_raw_path)

In [19]:
# Preview the first rows of the raw data, skipping the first column by position.
# NOTE(review): the skipped column is presumably the sentiment label — confirm.
df_sentiment_raw.iloc[:, 1:].head()

Unnamed: 0,id,date,flag,user,text
0,2000333855,Mon Jun 01 21:53:38 PDT 2009,NO_QUERY,PlumStSamplers,Sophie's party is so much fun with only 4 girl...
1,2054674369,Sat Jun 06 07:51:29 PDT 2009,NO_QUERY,SheaSoul,Here with Phlash. Flight's delayed- shopping t...
2,2031413186,Thu Jun 04 09:46:55 PDT 2009,NO_QUERY,daandewijs,"Democracy, its a beast but the best option we ..."
3,2002738455,Tue Jun 02 05:02:09 PDT 2009,NO_QUERY,johnwaire,@bevhollis SCHWEET! i'm very jealous
4,1835576806,Mon May 18 06:18:09 PDT 2009,NO_QUERY,happyseaurchin,@shixianjia @rosemary0 thankyou for your wishe...


In [4]:
from sklearn.model_selection import train_test_split


# Labels are taken from the first column (by position), divided by 4 and cast
# to int.
# NOTE(review): presumably the raw polarity is coded 0/4, so this maps it to
# 0/1 — confirm against the dataset.
sentiment_labels = (df_sentiment_raw.iloc[:, 0] / 4).astype(int)

# The feature side of the split is the full raw frame (it still contains the
# first column); split size and seed come from the training config.
x_train, x_test, y_train, y_test = train_test_split(
    df_sentiment_raw,
    sentiment_labels,
    test_size=TRAINING_CONFIG.test_size,
    random_state=TRAINING_CONFIG.random_state,
)

In [5]:
# Predict labels for the held-out raw rows with the full inference pipeline.
# Per the log output of this cell, the call also writes the processed data and
# the TF-IDF features for 'sentiment140' to the cache as a side effect.
y_test_preds = inference_pipeline.predict(x_test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    7.0s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.1s finished
2023-06-11 00:00:18,440:INFO:prep: Raw text has been processed for data_key: 'sentiment140'.
2023-06-11 00:00:18,550:INFO:prep: Processed data has been saved for data_key: sentiment140.
2023-06-11 00:00:18,551:INFO:feature_generation:Found vectorizer. Loading from /home/ubuntu/twitter-sentiment-analysis/src/birdwatcher/ml/tfidf_vectorizer.pkl.
2023-06-11 00:00:18,557:INFO:feature_generation:Transforming df_proc using fitted Tfidfvectorizer.
2023-06-11 00:00:20,389:INFO:feature_generation:Saving tfidf features to s3://twitter-sentiment-analysis-dev/cache/tfidf_data/sentiment140/Sentiment140_tfidf.parquet.


In [15]:
# Look up the cached "target" values for rows whose index label also appears in
# x_test's index.
# `df_sentiment140_tfidf` is not defined in this cell; it is loaded by the cell
# that reads the cached TF-IDF parquet, so run that cell first.
# NOTE(review): only a subset of rows matches in the output, which suggests the
# cached frame's index does not line up with x_test's original index — confirm.
df_sentiment140_tfidf.loc[df_sentiment140_tfidf.index.isin(x_test.index), "target"]

3       1
4       1
6       1
7       0
10      0
       ..
1839    1
1841    0
1843    1
1849    1
1852    0
Name: target, Length: 384, dtype: int64

In [26]:
from birdwatcher.ml.feature_generation import _get_tfidf_save_path


# Read back the cached Sentiment140 TF-IDF frame and take its "target" column
# as the reference labels.
# NOTE(review): this rebinds `y_test`, replacing any value produced earlier by a
# train/test split cell — confirm that is intended.
tfidf_path = _get_tfidf_save_path(data_key="sentiment140")
df_sentiment140_tfidf = pd.read_parquet(tfidf_path)
y_test = df_sentiment140_tfidf["target"]

In [27]:
from sklearn.metrics import accuracy_score


# Fraction of predictions that match the reference labels (compared by position).
accuracy_score(y_true=y_test, y_pred=y_test_preds)

0.8072159396876682