In [4]:
import pandas as pd
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
# Custom Roman Urdu stopwords
STOP_WORDS = set([
    "ka", "ki", "kaun", "kon", "kya", "aur", "se", "mein", "tu", "tum", "main", "wo", "woh",
    "to", "mn", "ja", "rha", "ye", "yeh", "is", "ko", "tha", "thi", "ke", "ho", "raha", "rahe",
    "bhi", "par", "ab", "hain", "hun", "tak", "jab", "sirf", "liye", "chal", "gaya", "gayi", "gai", 
    "wahan", "ahan", "kyun", "kis", "hona", "hoti", "hota", "kar", "karo", "karta", "karte"
])

In [6]:
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    words = [w for w in text.split() if w not in STOP_WORDS]
    return " ".join(words)

In [None]:
dataset_path = r"C:\Users\hp\OneDrive\Desktop\ABS\University\Sentiment_Analysis\Dataset 11000 Reviewss.csv"  

# Reading the CSV dataset
df = pd.read_csv(dataset_path, encoding="latin1")  

In [9]:
df = df.rename(columns={
    'label': 'sentiment',   # Change if your label column has a different name
    'review': 'review'      # Change if your review column has a different name
})


In [10]:
# Preprocess reviews
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [13]:
# Vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Prediction function
def predict_sentiment(text):
    cleaned = preprocess_text(text)
    vec = vectorizer.transform([cleaned])
    return model.predict(vec)[0]

# Test with your own review
while True:
    user_input = input("\nEnter Roman Urdu review (or 'exit' to stop): ")
    if user_input.lower() == "exit":
        break
    print("Predicted Sentiment:", predict_sentiment(user_input))

Accuracy: 81.18%

Classification Report:
               precision    recall  f1-score   support

         neg       0.81      0.81      0.81      1602
         pos       0.82      0.82      0.82      1698

    accuracy                           0.81      3300
   macro avg       0.81      0.81      0.81      3300
weighted avg       0.81      0.81      0.81      3300


Confusion Matrix:
 [[1290  312]
 [ 309 1389]]
Predicted Sentiment: neg
Predicted Sentiment: pos
Predicted Sentiment: pos
Predicted Sentiment: pos
Predicted Sentiment: neg
Predicted Sentiment: pos
