# **Representation**

In [None]:
!pip install datasets


In [None]:
# Attach Google Drive to the Colab filesystem so the dataset and the saved
# model artifacts under '/content/drive' are reachable from this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')


In [4]:
from datasets import load_from_disk

# Location on Google Drive of the dataset that was previously saved to disk
load_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Data/new_ds'

# Read the saved DatasetDict back into memory
loaded_ds = load_from_disk(dataset_path=load_path)

# Leave the DatasetDict as the last expression so the notebook renders its summary
loaded_ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'cleaned_text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text', 'cleaned_text'],
        num_rows: 50000
    })
})

In [5]:
# Convert both splits of the DatasetDict to pandas DataFrames in one step
train_df, test_df = (loaded_ds[split].to_pandas() for split in ('train', 'test'))

In [None]:
# Count missing values per column across the whole training frame.
# (Calling .head() first would only inspect the first five rows and could
# report zero nulls even when later rows contain missing text.)
train_df.isnull().sum()

label           0
text            0
cleaned_text    0
dtype: int64

## TF-IDF

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Replace missing 'cleaned_text' entries with '' so the vectorizers only see strings
train_df['cleaned_text'] = train_df['cleaned_text'].fillna('')
test_df['cleaned_text'] = test_df['cleaned_text'].fillna('')

# Learn the vocabulary and IDF weights from the training split and vectorize it in
# the same pass; fit_transform avoids tokenizing the training corpus twice.
tfidf_vectorizer = TfidfVectorizer()
train_tfidf = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])

# The test split is only transformed, so no test information reaches the fitted vocabulary/IDF
test_tfidf = tfidf_vectorizer.transform(test_df['cleaned_text'])

# Inspect a small preview rather than printing the entire vocabulary dict,
# which floods the cell output and bloats the saved notebook.
vocab_preview_size = 20
tfidf_vocab_preview = dict(list(tfidf_vectorizer.vocabulary_.items())[:vocab_preview_size])
print('TF-IDF Vocabulary size:', len(tfidf_vectorizer.vocabulary_))
print(f'TF-IDF Vocabulary (first {vocab_preview_size} entries):', tfidf_vocab_preview)
print('TF-IDF IDF Values:', tfidf_vectorizer.idf_)

# Row 0 of the already-computed matrix is the vector of the first training document,
# so there is no need to vectorize that document again.
vector = train_tfidf[0]
print('Vector:', vector.toarray())


Output hidden; open in https://colab.research.google.com to view.

In [46]:
# Persist the fitted TF-IDF vectorizer, both transformed matrices, and the
# DataFrames they were built from to the project's results folder on Google Drive.
import joblib

# Fitted vectorizer (vocabulary + IDF weights)
joblib.dump(
    value=tfidf_vectorizer,
    filename='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/tfidf_vectorizer.joblib',
)

# TF-IDF matrices for the train and test splits
joblib.dump(
    value=train_tfidf,
    filename='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/train_tfidf.joblib',
)
joblib.dump(
    value=test_tfidf,
    filename='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/test_tfidf.joblib',
)

# Optional: CSV snapshots of the train and test DataFrames (row index omitted)
train_df.to_csv(
    path_or_buf='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/tfidf_train_df.csv',
    index=False,
)
test_df.to_csv(
    path_or_buf='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/tfidf_test_df.csv',
    index=False,
)


## Bag of Words (BoW)

In [None]:
# Count Vectorizer (Bag of Words)
# Learn the vocabulary from the training split and vectorize it in the same pass;
# fit_transform avoids tokenizing the training corpus twice.
count_vectorizer = CountVectorizer()
train_bow = count_vectorizer.fit_transform(train_df['cleaned_text'])

# The test split is only transformed with the vocabulary learned from training data
test_bow = count_vectorizer.transform(test_df['cleaned_text'])

# Inspect a small preview rather than printing the entire vocabulary dict,
# which floods the cell output and bloats the saved notebook.
vocab_preview_size = 20
bow_vocab_preview = dict(list(count_vectorizer.vocabulary_.items())[:vocab_preview_size])
print('Count Vectorizer Vocabulary size:', len(count_vectorizer.vocabulary_))
print(f'Count Vectorizer Vocabulary (first {vocab_preview_size} entries):', bow_vocab_preview)

# Compare the two representations of the first training document
print('Sample TF-IDF Vector:', train_tfidf[0].toarray())
print('Sample BoW Vector:', train_bow[0].toarray())


Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Document counts: one matrix row per document in each split
num_train_docs, num_test_docs = train_tfidf.shape[0], test_tfidf.shape[0]
total_docs = num_train_docs + num_test_docs

# Vocabulary sizes learned by each vectorizer
tfidf_vocab_size = len(tfidf_vectorizer.vocabulary_)
bow_vocab_size = len(count_vectorizer.vocabulary_)

# Report everything in one call, one line per statistic
print(
    f"Number of documents in the train dataset: {num_train_docs}",
    f"Number of documents in the test dataset: {num_test_docs}",
    f"Total number of documents: {total_docs}",
    f"Total words in the TF-IDF vocabulary: {tfidf_vocab_size}",
    f"Total words in the BoW vocabulary: {bow_vocab_size}",
    sep="\n",
)


Number of documents in the train dataset: 650000
Number of documents in the test dataset: 50000
Total number of documents: 700000
Total words in the TF-IDF vocabulary: 291159
Total words in the BoW vocabulary: 291159


# **Models**

## Logistic Regression Model with TF-IDF

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Target labels for both splits
y_train, y_test = train_df['label'], test_df['label']

# Two-step pipeline: scale the features, then fit a logistic regression.
# with_mean=False skips centering, which is required for sparse input.
# NOTE(review): scaling TF-IDF features to unit variance may up-weight rare terms;
# consider comparing against an unscaled baseline — TODO confirm.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=False)),
    ('log_reg', LogisticRegression(max_iter=3000, solver='liblinear')),
])

# Fit on the training matrix, then predict labels for the test matrix
y_pred = pipeline.fit(train_tfidf, y_train).predict(test_tfidf)

# Score the predictions against the true test labels
accuracy_lr = accuracy_score(y_test, y_pred)
report_lr = classification_report(y_test, y_pred)

# Show the metrics
print(f'Accuracy: {accuracy_lr}')
print(f'Classification Report:\n{report_lr}')


Accuracy: 0.39218
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.55      0.52     10000
           1       0.35      0.30      0.32     10000
           2       0.31      0.28      0.29     10000
           3       0.34      0.32      0.33     10000
           4       0.43      0.50      0.46     10000

    accuracy                           0.39     50000
   macro avg       0.39      0.39      0.39     50000
weighted avg       0.39      0.39      0.39     50000



In [38]:
import joblib

# Serialize the fitted scaler + logistic-regression pipeline to Google Drive.
# joblib.dump returns the list of written file names, which the notebook displays.
joblib.dump(
    value=pipeline,
    filename='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/logistic_regression_pipeline.pkl',
)


['/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/logistic_regression_pipeline.pkl']

In [39]:
# Destination of the logistic-regression evaluation summary
results_file_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/lr_model_evaluation_results.txt'

# Overwrite the file with the accuracy line followed by the classification report
with open(results_file_path, 'w') as file:
    file.writelines((
        f'Accuracy: {accuracy_lr}\n',
        f'Classification Report:\n{report_lr}',
    ))


In [None]:
# # Read the Logistic Regression evaluation results from the file
# with open(results_file_path, 'r') as file:
#     results = file.read()

# print(results)


## Random Forest Model with TF-IDF


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fraction of the training rows used to fit the forest. The model is trained on a
# subsample of the training set rather than on all of it.
RF_TRAIN_FRACTION = 0.25

# Sample a reproducible subset of the training data (25% of the rows)
sampled_train_df = train_df.sample(frac=RF_TRAIN_FRACTION, random_state=42)
sampled_train_tfidf = tfidf_vectorizer.transform(sampled_train_df['cleaned_text'])
y_train_sampled = sampled_train_df['label']

# Define the test labels here as well so this section does not silently depend on
# the logistic-regression cell having been executed first.
y_test = test_df['label']

# Instantiate the Random Forest model (fixed seed for reproducible trees)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model on the subset
rf_model.fit(sampled_train_tfidf, y_train_sampled)

# Predict on the full test set (only the training data is subsampled)
y_pred = rf_model.predict(test_tfidf)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred)
report_rf = classification_report(y_test, y_pred)

# Print results
print(f'Accuracy: {accuracy_rf}')
print(f'Classification Report:\n{report_rf}')



Accuracy: 0.38278
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.62      0.54     10000
           1       0.34      0.27      0.30     10000
           2       0.31      0.28      0.29     10000
           3       0.32      0.30      0.31     10000
           4       0.42      0.46      0.44     10000

    accuracy                           0.38     50000
   macro avg       0.37      0.38      0.37     50000
weighted avg       0.37      0.38      0.37     50000



In [40]:
import joblib

# Serialize the fitted Random Forest classifier to Google Drive.
# joblib.dump returns the list of written file names, which the notebook displays.
joblib.dump(
    value=rf_model,
    filename='/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/random_forest_model.pkl',
)


['/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/random_forest_model.pkl']

In [41]:
# Destination of the Random Forest evaluation summary
results_file_path = '/content/drive/My Drive/Colab Notebooks/LLM Project GoogleColab/Models Results/random_forest_evaluation_results.txt'

# Overwrite the file with the accuracy line followed by the classification report
with open(results_file_path, 'w') as file:
    file.writelines((
        f'Accuracy: {accuracy_rf}\n',
        f'Classification Report:\n{report_rf}',
    ))


In [None]:
# # Read the evaluation results from the file
# with open(results_file_path, 'r') as file:
#     results = file.read()

# print(results)
