In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
# Fetch the NLTK data packages this script needs, one after another:
# the two Punkt tokenizer packages, the stop-word lists, and the WordNet data.
for resource_name in ('punkt_tab', 'punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

# Read the dataset CSV from its fixed location
data_path = '/content/reviews_dataset.csv'
data = pd.read_csv(data_path)

# Show which columns were loaded
print("Dataset columns:", data.columns)

# The rest of the script reads the 'news' and 'type' columns,
# so stop right away if either one is absent
if not {'news', 'type'}.issubset(data.columns):
    raise ValueError("Dataset must contain 'news' and 'type' columns")

# Shared preprocessing resources used by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: delete every character that is not an ASCII letter or whitespace,
    lowercase, tokenize with NLTK, discard English stop words, and lemmatize
    the remaining tokens.

    Args:
        text: Raw document text. Non-string values (for example the float NaN
            pandas produces for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; '' when nothing survives.
    """
    # re.sub raises TypeError on non-strings such as NaN/None, so treat a
    # missing cell as an empty document instead of crashing the whole apply().
    if not isinstance(text, str):
        return ''

    # Remove non-alphabetic characters, then lowercase once up front.
    # NOTE: characters are deleted rather than replaced by a space, so
    # "well-known" becomes "wellknown".
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    # Tokenize into words
    try:
        words = word_tokenize(text)
    except LookupError:
        # Tokenizer data is missing. Depending on the NLTK release the
        # tokenizer looks up either 'punkt' or 'punkt_tab', so fetch both
        # before retrying; downloading only 'punkt' would fail again on
        # releases that need 'punkt_tab'.
        for resource_name in ('punkt', 'punkt_tab'):
            nltk.download(resource_name)
        words = word_tokenize(text)

    # Remove stop words and lemmatize
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Apply preprocessing
data['cleaned_text'] = data['news'].apply(preprocess_text)

# Build unigram, bigram and trigram count features with CountVectorizer
# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so n-grams that occur only in test rows still become feature columns.
# Consider fitting the vectorizer on the training rows only - confirm whether
# that matters for this experiment.
vectorizer = CountVectorizer(ngram_range=(1, 3))  # Unigrams, Bigrams, Trigrams
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['type']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=150, random_state=42)
rf_classifier.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Overall Accuracy:", accuracy)

# Generate classification report
# target_names is deliberately not passed: scikit-learn pairs target_names
# positionally with the *sorted* labels, while y.unique() is in order of first
# appearance, so passing it could attach class names to the wrong rows (and
# can raise when the test split lacks one of the classes). Without it the
# report rows are labelled by the actual class values.
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Dataset columns: Index(['news', 'type'], dtype='object')
Overall Accuracy: 0.9258426966292135
Classification Report:
                precision    recall  f1-score   support

     business       0.87      0.97      0.92       115
entertainment       0.95      0.86      0.91        72
     politics       0.95      0.91      0.93        76
        sport       0.91      1.00      0.95       102
         tech       1.00      0.85      0.92        80

     accuracy                           0.93       445
    macro avg       0.94      0.92      0.92       445
 weighted avg       0.93      0.93      0.93       445

