<a href="https://colab.research.google.com/github/DhanavathAkhil/AI-project/blob/main/code/AI_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# One-time NLTK resource setup for this runtime.
import nltk
nltk.download('stopwords')  # English stopword corpus, read later through nltk.corpus.stopwords
# Punkt tokenizer models. NOTE(review): the cells visible here tokenize with
# RegexpTokenizer, so this download looks unused -- confirm before removing.
# As the last expression of the cell, its return value is what the cell displays.
nltk.download('punkt')  # Ensure tokenizer also works


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import nltk
import os
from nltk.corpus import stopwords

# Extra directory for NLTK resources. It is registered on NLTK's search path
# below so that corpus look-ups can find what gets downloaded into it.
nltk_data_path = "/usr/local/nltk_data"

# Ensure directory exists
os.makedirs(nltk_data_path, exist_ok=True)

# Register the directory exactly once. An unconditional append would add one
# more duplicate entry to nltk.data.path every time this cell is re-run.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download stopwords into that directory
nltk.download('stopwords', download_dir=nltk_data_path)

# Sanity check: the corpus loads and yields English stopwords
print(stopwords.words('english')[:10])


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


[nltk_data] Downloading package stopwords to /usr/local/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# ✅ Import Required Libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# ✅ Make sure the NLTK resources are present (already-installed packages are
#    simply reported as up-to-date)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# ✅ Load Dataset, decoding the CSV as ISO-8859-1
# NOTE(review): this path assumes Google Drive is already mounted at
# /content/drive -- no mount call is visible in this part of the notebook.
file_path = "/content/drive/My Drive/AI/data.csv"  # Adjust path accordingly
df = pd.read_csv(
    file_path,
    encoding="ISO-8859-1",
)

# ✅ Report what was loaded
print("✅ Dataset loaded successfully!")
print("📌 Available Columns:", df.columns)

# ✅ Automatically Identify Text & Label Columns
possible_text_cols = ['text', 'article_content', 'content']
possible_label_cols = ['label', 'labels', 'category']


def _pick_column(columns, candidates, fallback_position):
    """Return the first column whose normalised name is one of ``candidates``.

    Names are compared case-insensitively with surrounding whitespace ignored,
    so headers such as ``"Text"`` or ``" label"`` are still recognised.
    Columns are scanned in DataFrame order, which keeps the previous choice
    whenever an exact match exists.

    Args:
        columns: Iterable of the DataFrame's column labels.
        candidates: Accepted column names.
        fallback_position: Positional index used when nothing matches.

    Returns:
        The matching column label, or the label at ``fallback_position``.

    Raises:
        ValueError: If nothing matches and the frame has too few columns for
            the positional fallback.
    """
    wanted = {str(name).strip().lower() for name in candidates}
    for col in columns:
        if str(col).strip().lower() in wanted:
            return col

    all_columns = list(columns)
    if fallback_position >= len(all_columns):
        raise ValueError(
            f"None of {candidates} found and the dataset has only "
            f"{len(all_columns)} column(s); cannot fall back to position "
            f"{fallback_position}."
        )
    fallback = all_columns[fallback_position]
    # A positional guess can easily be an ID or metadata column, so say so.
    print(f"⚠️ None of {candidates} found; falling back to column {fallback!r}")
    return fallback


text_column = _pick_column(df.columns, possible_text_cols, 0)
label_column = _pick_column(df.columns, possible_label_cols, 1)

# Training a classifier to predict its own input column is never intended.
if text_column == label_column:
    raise ValueError(
        f"Text and label resolved to the same column {text_column!r}; "
        "set text_column / label_column explicitly."
    )

print(f"📌 Using Text Column: {text_column}")
print(f"📌 Using Label Column: {label_column}")

# ✅ Load Stopwords
def _english_stopwords():
    """Return NLTK's English stopwords as a set, fetching the corpus on a miss."""
    try:
        return set(stopwords.words('english'))
    except LookupError:
        # Corpus not found on any NLTK search path: download it, then retry once.
        nltk.download('stopwords')
        return set(stopwords.words('english'))


stop_words = _english_stopwords()

# ✅ Text Preprocessing Function
# Built once instead of on every call: deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stopword_set=None):
    r"""Normalise one raw document into a space-separated string of content words.

    Steps, in order: lower-case and strip, delete digit runs, delete ASCII
    punctuation, split into ``\w+`` tokens, drop stopwords.

    Args:
        text: Raw document. Missing values (``None``, ``NaN``, ``pd.NA``) are
            treated as "no text" instead of being stringified into the bogus
            token ``"nan"`` / ``"none"``.
        stopword_set: Optional collection of lower-case words to discard.
            Defaults to the module-level ``stop_words``.

    Returns:
        The cleaned text, or ``np.nan`` when the input is missing or nothing
        survives cleaning, so callers can ``dropna`` such rows.
    """
    # Without this guard str(nan) == "nan" would pass through as a real word
    # and the row would escape the later dropna().
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return np.nan

    if stopword_set is None:
        stopword_set = stop_words

    cleaned = str(text).lower().strip()  # Convert to lowercase & remove extra spaces
    cleaned = re.sub(r'\d+', '', cleaned)  # Remove numbers
    # NOTE(review): punctuation is deleted, not replaced by a space, so
    # "opposition-held" becomes "oppositionheld" and URLs collapse into one
    # long token. Kept as-is to preserve the existing feature space.
    cleaned = cleaned.translate(_PUNCTUATION_TABLE)  # Remove punctuation

    # Same tokens as RegexpTokenizer(r'\w+').tokenize(cleaned).
    words = [word for word in re.findall(r'\w+', cleaned) if word not in stopword_set]

    return " ".join(words) if words else np.nan  # Return NaN if empty

# ✅ Apply Preprocessing
df['clean_text'] = df[text_column].apply(preprocess_text)

# ✅ Remove Rows Without Usable Text or Without a Label
# Done before label encoding: a missing label would otherwise become category
# code -1 (a bogus extra class) or a NaN target that the classifier rejects.
# Rebinding a copy instead of inplace=True keeps the later column assignment
# free of SettingWithCopyWarning.
df = df.dropna(subset=['clean_text', label_column]).copy()

# ✅ Encode Labels as Integers if Needed
# Anything non-numeric (object, string or categorical dtype) is mapped to
# integer category codes, assigned in sorted order of the distinct labels.
# Numeric labels are used unchanged.
if pd.api.types.is_numeric_dtype(df[label_column]):
    df['label'] = df[label_column]
else:
    df['label'] = df[label_column].astype('category').cat.codes

print(f"📌 Total Records After Preprocessing: {len(df)}")
if df.empty:
    print("❌ Error: No valid text data found after preprocessing! Exiting...")
    # Raise rather than call exit(): inside a Jupyter/Colab kernel exit()
    # requests a kernel shutdown instead of cleanly stopping at this line.
    raise ValueError("No valid text data found after preprocessing.")

# ✅ Split Data into Training & Testing Sets
# 80/20 split with a fixed seed so the reported metrics are reproducible.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_text'], df['label'], test_size=0.2, random_state=42
    )
except ValueError as e:
    print(f"❌ Error in train-test split: {e}")
    # Re-raise rather than call exit(): inside a Jupyter/Colab kernel exit()
    # requests a kernel shutdown instead of stopping here, and the lines below
    # would then fail on the undefined X_train.
    raise

print("✅ Sample preprocessed texts:\n", X_train.head())

# ✅ Convert Text to Numerical Representation using TF-IDF
# The vocabulary is learned from the training split only and then reused for
# the test split, so no test-set information leaks into the features.
vectorizer = TfidfVectorizer(min_df=1, max_features=5000)
try:
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
except ValueError as e:
    print(f"❌ Error: {e}")
    print("🔹 Possible reason: The vocabulary is empty due to excessive stopword removal.")
    # Re-raise rather than call exit(): inside a Jupyter/Colab kernel exit()
    # requests a kernel shutdown instead of stopping here, leaving the
    # training code below to run without its feature matrices.
    raise

# ✅ Fit a multinomial Naïve Bayes classifier on the TF-IDF training matrix
model = MultinomialNB()
model.fit(X=X_train_tfidf, y=y_train)

# ✅ Score the held-out split
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report_text = classification_report(y_true=y_test, y_pred=y_pred)

# ✅ Print overall accuracy followed by the per-class breakdown
print(f"✅ Model Accuracy: {accuracy:.4f}")
print("\n🔹 Classification Report:\n", report_text)

# ✅ Example Predictions
# Integer class that means "fake" in the training labels.
# NOTE(review): the columns printed on load match the FA-KES Syrian-war
# dataset, whose `labels` column appears to use 1 = credible and 0 = fake.
# The previous check (`prediction[0] == 1` -> "Fake") therefore looks inverted.
# Confirm against the dataset's documentation. If labels were strings, their
# integer codes follow the sorted label names and this value must be set to match.
FAKE_LABEL = 0


def classify_news(raw_text):
    """Classify one headline or article with the trained model and print the verdict.

    Args:
        raw_text: Unprocessed text to classify.

    Returns:
        The predicted class label for ``raw_text``.
    """
    cleaned = preprocess_text(raw_text)
    # preprocess_text returns NaN when nothing survives cleaning, and the
    # vectorizer rejects NaN documents. An empty string yields an all-zero
    # feature row instead of an exception.
    if not isinstance(cleaned, str):
        cleaned = ""

    features = vectorizer.transform([cleaned])
    predicted = model.predict(features)[0]

    if predicted == FAKE_LABEL:
        print("🛑 Fake News Detected!")
    else:
        print("✅ Real News!")
    return predicted


for sample_text in (
    "Breaking news: Stock market crashes due to economic instability!",
    "narendra modi was removed from prime minister",
):
    prediction = classify_news(sample_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Dataset loaded successfully!
📌 Available Columns: Index(['unit_id', 'article_title', 'article_content', 'source', 'date',
       'location', 'labels'],
      dtype='object')
📌 Using Text Column: article_content
📌 Using Label Column: labels
📌 Total Records After Preprocessing: 804
✅ Sample preprocessed texts:
 344    published august send httpsabahdailyesxnam lea...
350    published april syrian opposition forces sunda...
443    april zarif urges intl factfinding mission pro...
331    oct russian jets pounded several oppositionhel...
290    last updated aug beirut syrian russian warplan...
Name: clean_text, dtype: object
✅ Model Accuracy: 0.5901

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.25      0.35        72
           1       0.59      0.87      0.70        89

    accuracy                           0.59       161
   macro avg       0.59      0.56      0.53       161
weighted avg       0.59      0.59      0.54   