<a href="https://colab.research.google.com/github/Amanollahi/Pat/blob/main/Patra_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:

url = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"
df = pd.read_csv(url)

# Check the first few rows
print(df.head())

# Check the shape of your dataset
print("\nDataset shape:", df.shape)

                                              review  label
0  After 6 years of faithful service, my hard dri...      0
1  So far so good - this is a comprehensive and p...      0
2  This has been a great and easy software to use...      0
3  This router is great. The setup and installati...      0
4  Overview\n\nThis is a great array for someone ...      0

Dataset shape: (3825, 2)


# Logistic Regression

In [5]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from sklearn.model_selection import RandomizedSearchCV

def random_search_hyperparameters(model, param_dist, X_train, y_train, n_iter=50):
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        scoring='f1_weighted',
        n_iter=n_iter,
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    print("Best Parameters:", random_search.best_params_)
    print("Best F1 Score:", random_search.best_score_)
    return random_search.best_estimator_



def train_Logis_model(X_train, y_train):
    """Train a Logistic Regression model."""
    # Added class_weight='balanced' to handle class imbalance
    model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_Logis_model(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9600424696076871

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       748
           1       0.11      0.12      0.11        17

    accuracy                           0.96       765
   macro avg       0.55      0.55      0.55       765
weighted avg       0.96      0.96      0.96       765



# SVM

In [7]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from sklearn.svm import SVC

def train_model(X_train, y_train):
    model = SVC(kernel='linear', class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_model(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9617220703439087

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.08      0.06      0.07        17

    accuracy                           0.96       765
   macro avg       0.53      0.52      0.53       765
weighted avg       0.96      0.96      0.96       765



# Random Forest

In [9]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from sklearn.ensemble import RandomForestClassifier

def train_model(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_model(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9608747044917257

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.97       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.97      0.96       765



# XGBOOST

In [12]:
!pip3 install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [13]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from xgboost import XGBClassifier

def train_xgboost(X_train, y_train):
    model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=10, random_state=42)
    model.fit(X_train, y_train)
    return model

from lightgbm import LGBMClassifier

def train_lightgbm(X_train, y_train):
    model = LGBMClassifier(class_weight='balanced', random_state=42)
    model.fit(X_train, y_train)
    return model

from catboost import CatBoostClassifier

def train_catboost(X_train, y_train):
    model = CatBoostClassifier(verbose=0, random_state=42)
    model.fit(X_train, y_train)
    return model

from sklearn.naive_bayes import MultinomialNB

def train_naive_bayes(X_train, y_train):
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_xgboost(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)


    model = train_lightgbm(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

    model = train_catboost(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

    model = train_naive_bayes(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64


Parameters: { "use_label_encoder" } are not used.




F1 Score: 0.9595502293238645

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.96       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.96      0.96       765

[LightGBM] [Info] Number of positive: 70, number of negative: 2990




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38401
[LightGBM] [Info] Number of data points in the train set: 3060, number of used features: 1428
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000





F1 Score: 0.9608747044917257

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.97       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.97      0.96       765


F1 Score: 0.9654827560850062

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       748
           1       0.00      0.00      0.00        17

    accuracy                           0.98       765
   macro avg       0.49      0.50      0.49       765
weighted avg       0.96      0.98      0.97       765


F1 Score: 0.9667915106117354

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       748
           1       0.00      0.00      0.00        17

    accuracy        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# ensamble models

In [14]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

from sklearn.ensemble import VotingClassifier

def train_ensemble(X_train, y_train):
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier

    model1 = LogisticRegression(class_weight='balanced', random_state=42)
    model2 = SVC(kernel='linear', class_weight='balanced', random_state=42, probability=True)
    model3 = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

    ensemble = VotingClassifier(estimators=[
        ('lr', model1), ('svc', model2), ('rf', model3)
    ], voting='soft')

    ensemble.fit(X_train, y_train)
    return ensemble


def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_ensemble(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9608747044917257

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       748
           1       0.00      0.00      0.00        17

    accuracy                           0.97       765
   macro avg       0.49      0.49      0.49       765
weighted avg       0.96      0.97      0.96       765

