# **Logistic regression**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Load the dataset and normalise it in a single chained expression:
# 'text' is renamed to 'instruction', 'humor' to 'response', and the
# 'response' column is cast to int.
df = (
    pd.read_csv('https://raw.githubusercontent.com/EvgeniaViskovatykh/BERT-fine-tune-Humor-Detection/main/dataset.csv')
    .rename(columns={'text': 'instruction', 'humor': 'response'})
    .assign(response=lambda frame: frame['response'].astype(int))
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['instruction'],
    df['response'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features capped at 5000 terms. The vectorizer is fitted on the
# training texts only and then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a default-configured logistic regression on the TF-IDF training
# features (fit() returns the estimator itself, so it can be chained).
log_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out set and build the per-class report before printing.
log_preds = log_model.predict(X_test_tfidf)
log_report = classification_report(
    y_test,
    log_preds,
    target_names=["Not Humor", "Humor"],
)
print("Logistic Regression Results:")
print(log_report)


Logistic Regression Results:
              precision    recall  f1-score   support

   Not Humor       0.92      0.92      0.92     20001
       Humor       0.92      0.92      0.92     19999

    accuracy                           0.92     40000
   macro avg       0.92      0.92      0.92     40000
weighted avg       0.92      0.92      0.92     40000



# **Naive Bayes**

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Fit a default-configured multinomial Naive Bayes classifier on the
# TF-IDF training features (fit() returns the estimator itself).
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out set and build the per-class report before printing.
nb_preds = nb_model.predict(X_test_tfidf)
nb_report = classification_report(
    y_test,
    nb_preds,
    target_names=["Not Humor", "Humor"],
)
print("Naive Bayes Results:")
print(nb_report)


Naive Bayes Results:
              precision    recall  f1-score   support

   Not Humor       0.92      0.89      0.90     20001
       Humor       0.89      0.92      0.91     19999

    accuracy                           0.90     40000
   macro avg       0.90      0.90      0.90     40000
weighted avg       0.90      0.90      0.90     40000



# **Random Forest**

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Reuse the train/test split and TF-IDF matrices built in the first cell.
# The original version of this cell recomputed them with identical settings
# (test_size=0.2, random_state=42, max_features=5000), which yields the same
# objects while repeating the vectorizer fit; sharing them also guarantees
# all three models are evaluated on exactly the same features.

# Train Random Forest (100 trees; fixed seed for reproducible results)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)

# Predict and evaluate, using the same header and class names as the
# Logistic Regression and Naive Bayes cells so the reports are comparable.
preds = rf.predict(X_test_tfidf)
print("Random Forest Results:")
print(classification_report(y_test, preds, target_names=["Not Humor", "Humor"]))


              precision    recall  f1-score   support

           0       0.92      0.89      0.91     20001
           1       0.90      0.92      0.91     19999

    accuracy                           0.91     40000
   macro avg       0.91      0.91      0.91     40000
weighted avg       0.91      0.91      0.91     40000

