<a href="https://colab.research.google.com/github/Amanollahi/Pat/blob/main/Patra_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:

url = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"
df = pd.read_csv(url)

# Check the first few rows
print(df.head())

# Check the shape of your dataset
print("\nDataset shape:", df.shape)

                                              review  label
0  After 6 years of faithful service, my hard dri...      0
1  So far so good - this is a comprehensive and p...      0
2  This has been a great and easy software to use...      0
3  This router is great. The setup and installati...      0
4  Overview\n\nThis is a great array for someone ...      0

Dataset shape: (3825, 2)


In [5]:
def load_data(file_path):
    """Load dataset from a CSV file."""
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    print(df.info())
    print("\nClass Distribution:")
    print(df['label'].value_counts())
    return df

def preprocess_text(text):
    """Clean and preprocess the text data."""
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

def preprocess_dataset(df):
    """Apply preprocessing to the dataset."""
    df['cleaned_text'] = df['review'].apply(preprocess_text)  # Fixed column name
    return df

def vectorize_text(train_texts, test_texts):
    """Convert text data into numerical format using TF-IDF."""
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

def train_model(X_train, y_train):
    """Train a Logistic Regression model."""
    # Added class_weight='balanced' to handle class imbalance
    model = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Evaluate the model using F1 score and other metrics."""
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("\nF1 Score:", f1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    return f1

def main():
    file_path = "https://raw.githubusercontent.com/Amanollahi/Pat/main/review_data.csv"

    # Load and preprocess
    df = load_data(file_path)
    df = preprocess_dataset(df)

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['label'],
        test_size=0.2,
        random_state=42,
        stratify=df['label']
    )

    # Feature engineering and training
    X_train_vec, X_test_vec, vectorizer = vectorize_text(X_train, X_test)
    model = train_model(X_train_vec, y_train)

    # Evaluation
    evaluate_model(model, X_test_vec, y_test)

if __name__ == "__main__":
    main()

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3825 entries, 0 to 3824
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  3824 non-null   object
 1   label   3825 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 59.9+ KB
None

Class Distribution:
label
0    3738
1      87
Name: count, dtype: int64

F1 Score: 0.9600424696076871

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       748
           1       0.11      0.12      0.11        17

    accuracy                           0.96       765
   macro avg       0.55      0.55      0.55       765
weighted avg       0.96      0.96      0.96       765

