# Import libraries

In [7]:
import os
import sys
sys.path.append('../../')
import config
import pandas as pd
import pickle
import numpy as np

import text_preprocessing as tp
from openai_api import get_embedding

from tqdm.notebook import tqdm
tqdm.pandas()

# Load data

In [2]:
train_df = pd.read_parquet(config.DATASET_TWEETS_TRAIN_FILE)
test_df = pd.read_parquet(config.DATASET_TWEETS_TEST_FILE)

# Prepare data

### Clean data

In [12]:
train_df['text_preprocessed'] = train_df['text'].apply(lambda text: tp.text_preprocessing(text))
test_df['text_preprocessed'] = test_df['text'].apply(lambda text: tp.text_preprocessing(text))

### Get texts embeddings

In [11]:
train_embeddings_filename = os.path.join(config.EMBEDDINGS_PATH, f'train_{len(train_df)}_words.csv')
if os.path.exists(train_embeddings_filename):
    embeddings_df = pd.read_csv(train_embeddings_filename)
    train_df['ada_embedding'] = embeddings_df["ada_embedding"].apply(eval).apply(np.array)
else:
    train_df['ada_embedding'] = train_df['text_preprocessed'].progress_apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    train_df.to_csv(train_embeddings_filename, index=False)
    


test_embeddings_filename = os.path.join(config.EMBEDDINGS_PATH, f'test_{len(test_df)}_words.csv')
if os.path.exists(test_embeddings_filename):
    embeddings_df = pd.read_csv(test_embeddings_filename)
    test_df['ada_embedding'] = embeddings_df["ada_embedding"].apply(eval).apply(np.array)
else:
    test_df['ada_embedding'] = test_df['text_preprocessed'].progress_apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    test_df.to_csv(test_embeddings_filename, index=False)

### Prepare model input

In [14]:
x_train = train_df['ada_embedding'].tolist()
y_train = train_df['label'].tolist()

x_test = test_df['ada_embedding'].tolist()
y_test = test_df['label'].tolist()

# Train models

### sklearn libraries

In [19]:
# Metrics calculation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

### Random Forest

Train model

In [21]:
from sklearn.ensemble import RandomForestClassifier

rfr = RandomForestClassifier(n_estimators=100)
rfr.fit(x_train, y_train)
preds = rfr.predict(x_test)

Get model results

In [23]:
results = confusion_matrix(y_test, preds) 
print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(y_test, preds)) 
print('Report : ')
print(classification_report(y_test, preds)) 

Confusion Matrix :
[[40 25]
 [31 90]]
Accuracy Score : 0.6989247311827957
Report : 
              precision    recall  f1-score   support

           0       0.56      0.62      0.59        65
           1       0.78      0.74      0.76       121

    accuracy                           0.70       186
   macro avg       0.67      0.68      0.68       186
weighted avg       0.71      0.70      0.70       186



# Old version classification

In [4]:
train_text = train_df['text_preprocessed'].tolist()
train_labels = train_df['label'].tolist()

test_text = test_df['text_preprocessed'].tolist()
test_labels = test_df['label'].tolist()

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 


In [9]:
# getting features
vectorizer = HashingVectorizer(n_features=20)
vectorizer_name = os.path.join(config.DATA_PATH_MODELS, 'v1_vectorizer.pk')
with open(vectorizer_name, 'wb') as file:
    pickle.dump(vectorizer, file)

train_features = vectorizer.fit_transform(train_text).toarray()
print(train_features.shape)

test_features = vectorizer.fit_transform(test_text).toarray()
print(test_features.shape)

(467, 20)
(186, 20)


In [10]:
# OneClassSVM algorithm
clf = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)

# fit OneClassSVM model 
clf.fit(train_features, train_labels)

model_name = os.path.join(config.DATA_PATH_MODELS, 'v1_model.pkl')
with open(model_name, 'wb') as file:
    pickle.dump(clf, file)

In [11]:
# validate OneClassSVM model with train set
preds_train = clf.predict(train_features)

print("accuracy:", accuracy_score(train_labels, preds_train))

accuracy: 0.37687366167023556


In [12]:
# validate OneClassSVM model with test set
preds_test = clf.predict(test_features)
preds_test

array([ 1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1, -1, -1, -1,  1,  1,
        1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1, -1,  1,  1,  1,  1, -1,
        1,  1,  1, -1,  1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,
       -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1,  1,  1,  1,
       -1, -1,  1,  1,  1, -1, -1,  1, -1, -1, -1,  1,  1, -1,  1,  1,  1,
       -1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1,
       -1,  1, -1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,
        1, -1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1, -1, -1,
        1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,
        1,  1, -1, -1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1],
      dtype=int64)

In [13]:
results = confusion_matrix(test_labels, preds_test) 
print('Confusion Matrix :')
print(results) 
print('Accuracy Score :',accuracy_score(test_labels, preds_test)) 
print('Report : ')
print(classification_report(test_labels, preds_test)) 

Confusion Matrix :
[[ 0  0  0]
 [28  0 37]
 [41  0 80]]
Accuracy Score : 0.43010752688172044
Report : 
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00        65
           1       0.68      0.66      0.67       121

    accuracy                           0.43       186
   macro avg       0.23      0.22      0.22       186
weighted avg       0.44      0.43      0.44       186



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
from predict import predict_text

predict_text('Você não é uma vadia')

-1