<a href="https://colab.research.google.com/github/Amanollahi/Pat/blob/main/Patra_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Traditional *ML*



## Imports

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import RandomizedSearchCV

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:

url = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"
df = pd.read_csv(url)

# Check the first few rows
print(df.head())

# Check the shape of your dataset
print("\nDataset shape:", df.shape)

                                              review  label
0  After 6 years of faithful service, my hard dri...      0
1  So far so good - this is a comprehensive and p...      0
2  This has been a great and easy software to use...      0
3  This router is great. The setup and installati...      0
4  Overview\n\nThis is a great array for someone ...      0

Dataset shape: (3825, 2)


## Logistic Regression

In [15]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer


def random_search_hyperparameters(model, param_dist, X_train, y_train, n_iter=50):
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        scoring='f1_weighted',
        n_iter=n_iter,
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    print("Best Parameters:", random_search.best_params_)
    print("Best F1 Score:", random_search.best_score_)
    return random_search.best_estimator_



def train_Logis_model(X_train, y_train):
    """Train a Logistic Regression model."""
    # Added class_weight='balanced' to handle class imbalance
    model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess the dataset
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering using TF-IDF vectorization
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)

    # Logistic Regression Hyperparameter Tuning
    param_dist = {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['liblinear', 'saga']
    }
    print("\nPerforming RandomizedSearchCV for Logistic Regression...")
    best_model = random_search_hyperparameters(LogisticRegression(class_weight='balanced', max_iter=1000), param_dist, X_train_vec, y_train)

    # Evaluate the best model
    print("\nEvaluating the Best Logistic Regression Model...")
    evaluate_model(best_model, X_test_vec, y_test)


if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

Performing RandomizedSearchCV for Logistic Regression...
Fitting 5 folds for each of 30 candidates, totalling 150 fits




Best Parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.01}
Best F1 Score: 0.9658186139469562

Evaluating the Best Logistic Regression Model...

F1 Score: 0.9667915106117354

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       748
           1       0.00      0.00      0.00        17

    accuracy                           0.98       765
   macro avg       0.49      0.50      0.49       765
weighted avg       0.96      0.98      0.97       765



50 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

## SVM

In [7]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from sklearn.svm import SVC

def train_model(X_train, y_train):
    model = SVC(kernel='linear', class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_model(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9617220703439087

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.08      0.06      0.07        17

    accuracy                           0.96       765
   macro avg       0.53      0.52      0.53       765
weighted avg       0.96      0.96      0.96       765



## Random Forest

In [9]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from sklearn.ensemble import RandomForestClassifier

def train_model(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_model(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9608747044917257

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.97       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.97      0.96       765



## XGBOOST

In [12]:
!pip3 install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [13]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from xgboost import XGBClassifier

def train_xgboost(X_train, y_train):
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=10, random_state=42)
    model.fit(X_train, y_train)
    return model

from lightgbm import LGBMClassifier

def train_lightgbm(X_train, y_train):
    model = LGBMClassifier(class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    return model

from catboost import CatBoostClassifier

def train_catboost(X_train, y_train):
    model = CatBoostClassifier(verbose=0, random_state=42)
    model.fit(X_train, y_train)
    return model

from sklearn.naive_bayes import MultinomialNB

def train_naive_bayes(X_train, y_train):
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_xgboost(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)


    model = train_lightgbm(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

    model = train_catboost(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

    model = train_naive_bayes(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64


Parameters: { "use_label_encoder" } are not used.




F1 Score: 0.9595502293238645

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.96       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.96      0.96       765

[LightGBM] [Info] Number of positive: 70, number of negative: 2990




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38401
[LightGBM] [Info] Number of data points in the train set: 3060, number of used features: 1428
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000





F1 Score: 0.9608747044917257

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.97       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.97      0.96       765


F1 Score: 0.9654827560850062

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       748
           1       0.00      0.00      0.00        17

    accuracy                           0.98       765
   macro avg       0.49      0.50      0.49       765
weighted avg       0.96      0.98      0.97       765


F1 Score: 0.9667915106117354

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       748
           1       0.00      0.00      0.00        17

    accuracy        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## grid search -- catboost and naive bayes -- plus SMOTE

In [20]:

from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.naive_bayes import MultinomialNB


# Step 1: Load and Explore Dataset
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

# Step 2: Preprocessing
def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

# Step 3: Feature Engineering
def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

# Step 4: Handle Class Imbalance
def handle_imbalance(X, y):
    """Handle class imbalance using SMOTE."""
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

# Step 5: Model Training and Hyperparameter Tuning
def train_logistic_regression(X_train, y_train):
    """Train a Logistic Regression model."""
    model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)
    return model

def grid_search_catboost(X_train, y_train):
    """Grid search for CatBoostClassifier with minimal parameter grid."""
    param_grid = {
        'depth': [6],               # Reduced from [4, 6, 8]
        'learning_rate': [0.1],  # Reduced from [0.01, 0.1, 0.2]
        'iterations': [500]       # Reduced from [100, 500, 1000]
    }
    model = CatBoostClassifier(verbose=0, random_state=42)
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1_weighted',
        cv=3,                          # Reduced from 5
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    print("\nBest Parameters for CatBoost:", grid.best_params_)
    print("Best F1 Score for CatBoost:", grid.best_score_)
    return grid.best_estimator_

def grid_search_naive_bayes(X_train, y_train):
    """Grid search for MultinomialNB with minimal parameter grid."""
    param_grid = {
        'alpha': [0.1, 1.0, 2.0]      # Reduced from [0.1, 0.5, 1.0, 2.0, 5.0]
    }
    model = MultinomialNB()
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='f1_weighted',
        cv=3,                          # Reduced from 5
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    print("\nBest Parameters for MultinomialNB:", grid.best_params_)
    print("Best F1 Score for MultinomialNB:", grid.best_score_)
    return grid.best_estimator_

def train_catboost(X_train, y_train):
    """Train a CatBoostClassifier."""
    model = CatBoostClassifier(verbose=0, random_state=42)
    model.fit(X_train, y_train)
    return model

def train_naive_bayes(X_train, y_train):
    """Train a MultinomialNB model."""
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model

# Step 6: Evaluation
def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

# Main Execution
def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)

    # Handle class imbalance
    X_train_resampled, y_train_resampled = handle_imbalance(X_train_vec, y_train)

    # CatBoost with Grid Search
    best_catboost = grid_search_catboost(X_train_resampled, y_train_resampled)
    evaluate_model(best_catboost, X_test_vec, y_test)

    # MultinomialNB with Grid Search
    best_naive_bayes = grid_search_naive_bayes(X_train_resampled, y_train_resampled)
    evaluate_model(best_naive_bayes, X_test_vec, y_test)

if __name__ == "__main__":
    main()


Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64


KeyboardInterrupt: 

# Ensemble models

In [14]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from sklearn.ensemble import VotingClassifier

def train_ensemble(X_train, y_train):
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier

    model1 = LogisticRegression(class_weight='balanced', random_state=42)
    model2 = SVC(kernel='linear', class_weight='balanced', random_state=42, probability=True)
    model3 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

    ensemble = VotingClassifier(estimators=[
        ('lr', model1), ('svc', model2), ('rf', model3)
    ], voting='soft')

    ensemble.fit(X_train, y_train)
    return ensemble


def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_ensemble(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9608747044917257

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.97       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.97      0.96       765



# Bert


In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

# Load and preprocess data
def load_data(file_path):
    """Load dataset from a CSV file."""
    try:
        df = pd.read_csv(file_path)
        print("Dataset loaded successfully.")
        print("\nDataset Info:")
        print(df.info())
        print("\nClass Distribution:")
        print(df['label'].value_counts())
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def preprocess_text(text):
    """Clean and preprocess the text data."""
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    if 'review' not in df.columns:
        raise ValueError("The dataset must contain a 'review' column.")

    df['cleaned_text'] = df['review'].apply(preprocess_text)
    return df

# Custom Dataset class for BERT
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    f1 = f1_score(true_labels, predictions, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))
    return f1

def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"
    df = load_data(file_path)

    # Preprocess dataset
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=2,
        problem_type="single_label_classification"
    )

    # Calculate class weights
    class_weights = torch.tensor(
        [1.0, (y_train == 0).sum() / (y_train == 1).sum()],
        dtype=torch.float
    ).to(device)
    model.config.class_weights = class_weights

    # Create datasets
    train_dataset = ReviewDataset(X_train.values, y_train.values, tokenizer)
    test_dataset = ReviewDataset(X_test.values, y_test.values, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=16,   # test for RAM
        shuffle=False
    )

    # Move model to device
    model.to(device)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Training loop
    epochs = 20
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        train_loss = train_epoch(model, train_loader, optimizer, device)
        print(f"Average training loss: {train_loss:.4f}")

        if epoch == epochs - 1:  # Evaluate on the last epoch
            print("\nEvaluating final model:")
            evaluate_model(model, test_loader, device)

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Using device: cuda
Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/20
Average training loss: 0.1434

Epoch 2/20
Average training loss: 0.1080

Epoch 3/20
Average training loss: 0.0984

Epoch 4/20
Average training loss: 0.0656

Epoch 5/20
Average training loss: 0.0472

Epoch 6/20
Average training loss: 0.0270

Epoch 7/20
Average training loss: 0.0224

Epoch 8/20
Average training loss: 0.0274

Epoch 9/20
Average training loss: 0.0186

Epoch 10/20
Average training loss: 0.0126

Epoch 11/20
Average training loss: 0.0101

Epoch 12/20
Average training loss: 0.0096

Epoch 13/20
Average training loss: 0.0090

Epoch 14/20
Average training loss: 0.0082

Epoch 15/20
Average training loss: 0.0086

Epoch 16/20
Average training loss: 0.0082

Epoch 17/20
Average training loss: 0.0081

Epoch 18/20
Average training loss: 0.0078

Epoch 19/20
Average training loss: 0.0078

Epoch 20/20
Average training loss: 0.0076

Evaluating final model:

F1 Score: 0.9691161501813214

Classification Report:
              precision    recall  f1-score   support

           0   

# Bert and SMOTE

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE  # For oversampling
from torch.nn import CrossEntropyLoss

# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

# Load and preprocess data
def load_data(file_path):
    """Load dataset from a CSV file."""
    try:
        df = pd.read_csv(file_path)
        print("Dataset loaded successfully.")
        print("\nDataset Info:")
        print(df.info())
        print("\nClass Distribution:")
        print(df['label'].value_counts())
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def preprocess_text(text):
    """Clean and preprocess the text data."""
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    if 'review' not in df.columns:
        raise ValueError("The dataset must contain a 'review' column.")

    df['cleaned_text'] = df['review'].apply(preprocess_text)
    return df

# Custom Dataset class for BERT
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_epoch(model, data_loader, optimizer, device, class_weights):
    model.train()
    total_loss = 0

    # Define loss function with class weights
    criterion = CrossEntropyLoss(weight=class_weights.to(device))

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Calculate loss with class weights
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    f1 = f1_score(true_labels, predictions, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(true_labels, predictions))
    return f1

from sklearn.feature_extraction.text import TfidfVectorizer

def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"
    df = load_data(file_path)

    # Preprocess dataset
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Convert text to numerical features using TF-IDF
    vectorizer = TfidfVectorizer(max_features=5000)  # Limit features to avoid memory issues
    X_train_tfidf = vectorizer.fit_transform(X_train)

    # Apply SMOTE to oversample the minority class
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

    # Convert numerical features back to text (for BERT input)
    X_train_resampled_text = vectorizer.inverse_transform(X_train_resampled)
    X_train_resampled_text = [' '.join(text) for text in X_train_resampled_text]

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=2,
        problem_type="single_label_classification"
    )

    # Calculate class weights
    class_weights = torch.tensor(
        [1.0, (y_train_resampled == 0).sum() / (y_train_resampled == 1).sum()],
        dtype=torch.float
    ).to(device)

    # Create datasets
    train_dataset = ReviewDataset(X_train_resampled_text, y_train_resampled, tokenizer)
    test_dataset = ReviewDataset(X_test, y_test, tokenizer)

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        shuffle=True
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=16,
        shuffle=False
    )

    # Move model to device
    model.to(device)

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Training loop
    epochs = 3
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        train_loss = train_epoch(model, train_loader, optimizer, device, class_weights)
        print(f"Average training loss: {train_loss:.4f}")

        if epoch == epochs - 1:  # Evaluate on the last epoch
            print("\nEvaluating final model:")
            evaluate_model(model, test_loader, device)

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using device: cpu
Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3
