<a href="https://colab.research.google.com/github/DaryaTereshchenko/ExperimentsUkr/blob/main/TF_IDF_RF%26LG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import regex as re
import pandas as pd
import numpy as np

In [None]:
def plot(y_true, y_pred, labels=None, title="", cmap=plt.cm.Blues):
    """Draw a row-normalised confusion matrix for a set of predictions.

    Every row (true class) is divided by its own total, so each cell shows
    the fraction of that class's samples that received the column's label.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    labels : list of str, optional
        Tick labels handed to ``ConfusionMatrixDisplay`` as ``display_labels``.
        They only affect the rendering; the classes themselves are taken from
        the values present in ``y_true`` / ``y_pred``.
    title : str
        Title placed on the figure.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    # normalize="true" does the per-row division inside scikit-learn and maps
    # an empty row (a label that only ever appears in y_pred) to zeros, where
    # a manual ``cm / cm.sum(axis=1)`` would produce NaN and a RuntimeWarning.
    con_mat = confusion_matrix(y_true, y_pred, normalize="true")
    disp = ConfusionMatrixDisplay(confusion_matrix=con_mat, display_labels=labels)
    disp.plot(cmap=cmap)
    plt.title(title)

In [None]:
# Load the annotated tweets from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/AnnotatedFull.csv")


def _normalize_tweet(text):
    """Drop every character that is neither a word character nor whitespace, then lowercase."""
    return re.sub(r'[^\w\s]', '', text).lower()


# One cleaned string per row of the "tweet" column, index preserved.
clean = df["tweet"].map(_normalize_tweet)

In [None]:
vec = TfidfVectorizer()
tf_idf = vec.fit_transform(clean)

In [None]:
X = tf_idf
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                   random_state=42, 
                                   test_size=0.3, 
                                   shuffle=True)

In [None]:
def logistic_classif(X_train, y_train, X_test, y_test, c_value=1.0):
    """Fit a logistic-regression classifier and predict labels for ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``LogisticRegression.fit``.
    X_test
        Features to predict on.
    y_test
        Accepted for signature compatibility; not used by this function.
    c_value : float
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    The predicted labels for ``X_test`` (the output of ``predict``).
    """
    classifier = LogisticRegression(C=c_value, solver="lbfgs")
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)

In [None]:
# Logistic-regression predictions for the held-out split (default C=1.0).
y_pred = logistic_classif(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Headline metrics for the current predictions, followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average="macro")

print('Accuracy:', accuracy)
print('F1 score:', macro_f1)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the logistic-regression predictions.
# The title previously read "Linear Regression", but the model is LogisticRegression.
plot(y_test, y_pred, labels=["neutral", "offensive"], title="TF-IDF Logistic Regression")

In [None]:
# Candidate values for LogisticRegression's C parameter.
param_grid_ = {
    'C': [1e-5, 1e-3, 1e-1, 1e0, 1e1, 1e2],
}

# 3-fold cross-validated grid search over C; every other setting is the default.
tfidf_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid_,
    cv=3,
)

In [None]:
tfidf_search.fit(X_train, y_train)

In [None]:
tfidf_search.best_score_

In [None]:
tfidf_search.best_params_

In [None]:
tfidf_search.cv_results_

In [None]:
# One-column table of the mean test score per candidate, in grid order.
mean_scores = tfidf_search.cv_results_['mean_test_score']
search_results = pd.DataFrame({'tfidf': mean_scores})
search_results

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Random-forest classifier on the TF-IDF features.
# max_features='sqrt' is exactly what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42, max_features='sqrt', max_depth=100)

# Train the model on the training split.
clf.fit(X_train, y_train)

# Predict on the test split and print the mean accuracy.
y_pred = clf.predict(X_test)
print(clf.score(X_test, y_test))

In [None]:
# Headline metrics for the current predictions, followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average="macro")

print('Accuracy:', accuracy)
print('F1 score:', macro_f1)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the random-forest predictions.
plot(
    y_true=y_test,
    y_pred=y_pred,
    labels=["neutral", "offensive"],
    title="TF-IDF RandomForest",
)