In [2]:
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import scipy.stats as stats




In [3]:


# Explanation: This code loads a dataset of tweets and preprocesses it by removing rows with
# neutral sentiment and dropping unused columns. The 'sentiment' column is then mapped to
# 0 (negative) or 1 (positive). The dataset is converted into a list and then back into a
# DataFrame with the header names 'text' and 'labels'.
# Next, the code sets up 5-fold cross-validation using the KFold class from scikit-learn.
# For each fold, the dataset is split into training and validation sets, and a pre-trained
# DistilBERT checkpoint (already fine-tuned for sentiment analysis on SST-2) is loaded with
# the ClassificationModel class from the Simple Transformers library.
# The model is trained on the training set and evaluated on the validation set. The accuracy
# of the model on the validation set is stored in the 'results' list. Finally, the mean
# accuracy across all folds is calculated and printed.

# Load the dataset
train_data = pd.read_csv("/content/sample_data/Tweets.csv")

# Keep only positive/negative tweets so the task is binary
train_data = train_data[train_data["sentiment"] != "neutral"]
# Drop unused columns
train_data = train_data.drop(columns=["textID", "selected_text"])
# Map sentiments to 0/1
train_data["sentiment"] = train_data["sentiment"].map({"positive": 1, "negative": 0})

# Convert train_data to list
dataset = train_data.values.tolist()

# One output directory per fold so each fold's checkpoints and evaluation files are kept apart
output_files = ["1", "2", "3", "4", "5"]
test_output_files = ["t1", "t2", "t3", "t4", "t5"]

# Turn dataset back into a DataFrame. Naming the columns 'text' and 'labels' gives
# Simple Transformers the headers it looks for, instead of relying on its positional
# fallback (column 0 = text, column 1 = label).
train_data = pd.DataFrame(dataset, columns=["text", "labels"])

# Display the updated DataFrame
print(train_data.head())

# Prepare cross-validation; the fixed random_state keeps the fold assignment reproducible
n = 5
kf = KFold(n_splits=n, random_state=42, shuffle=True)

# Validation accuracy of each fold, in fold order
results = []

for fold, (train_index, val_index) in enumerate(kf.split(train_data)):
    # Split the DataFrame into this fold's training and validation rows
    train_df = train_data.iloc[train_index]
    val_df = train_data.iloc[val_index]

    # Load a fresh copy of the pre-trained checkpoint for every fold so that no weights
    # leak from one fold into the next. overwrite_output_dir lets the cell be re-run
    # without the "output directory already exists" error.
    model = ClassificationModel(
        "distilbert",
        "distilbert-base-uncased-finetuned-sst-2-english",
        args={"overwrite_output_dir": True},
    )

    # Train the model on this fold's training split
    model.train_model(train_df, output_dir=output_files[fold])

    # Validate the model; the extra keyword argument adds an 'acc' entry to the result dict
    result, model_outputs, wrong_predictions = model.eval_model(
        val_df, acc=accuracy_score, output_dir=test_output_files[fold]
    )
    print(result["acc"])

    # Append this fold's accuracy
    results.append(result["acc"])


print("results", results)
# The averaged metric is accuracy (accuracy_score), not precision
print(f"Mean accuracy: {sum(results) / len(results)}")



                                                   0  1
0      Sooo SAD I will miss you here in San Diego!!!  0
1                          my boss is bullying me...  0
2                     what interview! leave me alone  0
3   Sons of ****, why couldn`t they put them on t...  0
4  2am feedings for the baby are fun when he is a...  1


Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



  0%|          | 0/13090 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1637 [00:00<?, ?it/s]



  0%|          | 0/3273 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/410 [00:00<?, ?it/s]

0.897952948365414




  0%|          | 0/13090 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1637 [00:00<?, ?it/s]



  0%|          | 0/3273 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/410 [00:00<?, ?it/s]

0.8835930339138405




  0%|          | 0/13090 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1637 [00:00<?, ?it/s]



  0%|          | 0/3273 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/410 [00:00<?, ?it/s]

0.9086465016804155




  0%|          | 0/13091 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1637 [00:00<?, ?it/s]



  0%|          | 0/3272 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/409 [00:00<?, ?it/s]

0.9037286063569682




  0%|          | 0/13091 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1637 [00:00<?, ?it/s]



  0%|          | 0/3272 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/409 [00:00<?, ?it/s]

0.8896699266503667
results [0.897952948365414, 0.8835930339138405, 0.9086465016804155, 0.9037286063569682, 0.8896699266503667]
Mean-Precision: 0.8967182033934009


In [4]:
#BEGIN[ChatGPT][https://chat.openai.com/auth/login]

# I copied and pasted the ChatGPT output to get skeleton code, and then revised the parts that did not work or looked odd.

# command: I would like to set up a null hypothesis and test the accuracy of the sentiment analysis model I created. Could you please provide me with the code


# Explanation: This code performs a one-sided, one-sample t-test to check whether the sentiment
# analysis model's accuracy is statistically significantly higher than random classification,
# using the accuracies obtained from 5-fold cross-validation. It calculates the mean and sample
# standard deviation of the accuracies, as well as the t-statistic and degrees of freedom.
# The critical value is found, and the null hypothesis is tested. The result indicates whether
# the model's accuracy is significantly higher than random classification or not.
# NOTE(review): fold accuracies from k-fold cross-validation share training data and are not
# fully independent, so treat this t-test as an approximation.

# Accuracies obtained from 5-fold cross-validation
model_accuracies = np.array(results)

# Expected accuracy of random guessing. Neutral tweets are removed before training, so the
# task has two classes (positive/negative) and uniform random guessing is right 50% of the time.
random_accuracy = 0.5

# Calculate the mean and the sample standard deviation.
# ddof=1 gives the unbiased sample estimate that the t-statistic requires;
# the NumPy default (ddof=0) is the population formula and inflates the t-statistic.
mean_accuracy = np.mean(model_accuracies)
std_dev_accuracy = np.std(model_accuracies, ddof=1)

# Calculate t-statistic and degrees of freedom
standard_error = std_dev_accuracy / np.sqrt(len(model_accuracies))
t_statistic = (mean_accuracy - random_accuracy) / standard_error
degrees_of_freedom = len(model_accuracies) - 1

# Set significance level (e.g., 0.05)
alpha = 0.05

# Find the critical value for a one-sided (upper-tail) test
critical_value = stats.t.ppf(1 - alpha, degrees_of_freedom)

# Null hypothesis (H0): The sentiment analysis model's accuracy is not higher than random classification.
# Alternative hypothesis (H1): The sentiment analysis model's accuracy is higher than random classification.

# Check whether the t-statistic is larger than the critical value
if t_statistic > critical_value:
    print("Reject the null hypothesis. The sentiment analysis model's accuracy is statistically significantly higher than random classification.")
else:
    print("Failed to reject the null hypothesis. The sentiment analysis model's accuracy is not statistically significantly higher than random classification.")

Reject the null hypothesis. The sentiment analysis model's accuracy is statistically significantly higher than random classification.
