<a href="https://colab.research.google.com/github/Amanollahi/Pat/blob/main/Patra_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Ensure necessary NLTK resources are available.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (it replaced the pickled 'punkt' models); 'punkt' is still requested
# so that older releases keep working. Without 'punkt_tab', word_tokenize
# raises LookupError on a fresh environment with a current NLTK.
# NOTE(review): on an NLTK release that does not know one of these ids,
# nltk.download() is expected to report the problem and return False rather
# than raise -- confirm against the pinned NLTK version.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Step 1: Load and Explore Dataset
def load_data(file_path):
    """Load the dataset from a CSV file and print a short summary.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``. The file must contain a ``label`` column.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        KeyError: If the loaded data has no ``label`` column.
    """
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.")
    print("Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()
    print("Class Distribution:")
    print(df['label'].value_counts())
    return df

# Step 2: Preprocessing
# Compiled once: matches every character that is neither an ASCII letter
# nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache so the stop-word list and the lemmatizer are built once
# instead of on every call to preprocess_text.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean and preprocess a single text value.

    The text is lower-cased, stripped of everything except ASCII letters and
    whitespace, tokenized, filtered against the English stop-word list and
    lemmatized.

    Args:
        text: The raw text. Non-string values (for example the NaN that
            ``pandas.read_csv`` yields for an empty cell, or ``None``) are
            treated as empty text instead of raising ``AttributeError``.

    Returns:
        The remaining lemmatized tokens joined by single spaces; an empty
        string when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = _NON_LETTER_RE.sub('', text.lower())
    if not text.strip():
        # Nothing left to tokenize; the result would be empty anyway.
        return ''
    stop_words, lemmatizer = _get_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

def preprocess_dataset(df):
    """Add a ``cleaned_text`` column holding the preprocessed ``text`` column.

    The frame is modified in place and also returned, so the call can be
    used either for its side effect or in an assignment.
    """
    cleaned = df['text'].apply(preprocess_text)
    df['cleaned_text'] = cleaned
    return df

# Step 3: Feature Engineering
def vectorize_text(train_texts, test_texts, max_features=5000):
    """Convert text data into numerical format using TF-IDF.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts, so the test set does not influence the fitted vectorizer.

    Args:
        train_texts: Iterable of training documents (strings).
        test_texts: Iterable of test documents (strings).
        max_features: Value forwarded to ``TfidfVectorizer(max_features=...)``.
            Defaults to 5000, the value that was previously hard-coded.

    Returns:
        A ``(X_train, X_test, vectorizer)`` tuple: the transformed training
        texts, the transformed test texts, and the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test, vectorizer

# Step 4: Model Training
def train_model(X_train, y_train):
    """Fit a default-configured Logistic Regression classifier and return it."""
    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return LogisticRegression().fit(X_train, y_train)

# Step 5: Evaluation
def evaluate_model(model, X_test, y_test, average='binary'):
    """Evaluate the model using F1 score and a classification report.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features to pass to ``model.predict``.
        y_test: True labels for ``X_test``.
        average: Averaging mode forwarded to ``f1_score``. The default
            ``'binary'`` matches the previous behavior; pass e.g.
            ``'macro'`` or ``'weighted'`` for multi-class labels.

    Returns:
        The F1 score computed from ``y_test`` and the model's predictions.
    """
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average=average)
    print("F1 Score:", f1)
    print("Classification Report:")
    # classification_report needs both the true and the predicted labels;
    # calling it with y_test alone raises TypeError.
    print(classification_report(y_test, y_pred))
    return f1

# Main Execution
def main():
    """Run the pipeline: load, preprocess, split, vectorize, train, evaluate."""
    # Replace with the actual dataset path
    dataset_path = "customer_feedback.csv"

    # Load the CSV, then add the cleaned-text column.
    frame = preprocess_dataset(load_data(dataset_path))

    # Stratified 80/20 split with a fixed seed for reproducibility.
    labels = frame['label']
    train_docs, test_docs, train_labels, test_labels = train_test_split(
        frame['cleaned_text'],
        labels,
        test_size=0.2,
        stratify=labels,
        random_state=42,
    )

    # TF-IDF features; the fitted vectorizer itself is not needed afterwards.
    train_matrix, test_matrix, _ = vectorize_text(train_docs, test_docs)

    # Fit the classifier and report its metrics on the held-out split.
    classifier = train_model(train_matrix, train_labels)
    evaluate_model(classifier, test_matrix, test_labels)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
