In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Load your CSV files
file1 = "/content/td1.csv"
file2 = "/content/td2.csv"
file3 = "/content/td3.csv"

# Read the CSV files into dataframes
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df3 = pd.read_csv(file3)

# Concatenate the dataframes into one
data = pd.concat([df1, df2, df3], ignore_index=True)

# Preprocessing: text cleaning, tokenization, and label encoding
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    if pd.isna(text):  # Check for missing values (NaN)
        return ""  # Return an empty string for missing values
    # Remove non-alphanumeric characters and convert to lowercase
    text = " ".join(word.lower() for word in str(text).split() if word.isalpha())
    # Tokenization
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

data['content'] = data['content'].apply(preprocess_text)

# Encode the 'label' column (e.g., 'yes' and 'no' to 1 and 0)
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Split the dataset into training and testing sets
X = data['content']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a text classification pipeline (you can customize this)
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('select_best', SelectKBest(chi2, k=2000)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Train the model
text_clf.fit(X_train, y_train)

# Evaluate the model
y_pred = text_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

# If needed, you can perform hyperparameter tuning using GridSearchCV
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [100, 200, 300],
}

grid_search = GridSearchCV(text_clf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.9672131147540983
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.79      0.85       400
           1       0.91      0.96      0.93       798
           2       0.99      1.00      1.00      2401

    accuracy                           0.97      3599
   macro avg       0.94      0.92      0.93      3599
weighted avg       0.97      0.97      0.97      3599



In [None]:
pip install pandas



In [None]:
pip install pandas scikit-learn nltk




In [3]:
## USING RandomForest
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
import joblib  # For model saving

# Load and preprocess the dataset
data_files = ["/content/td1.csv", "/content/td2.csv", "/content/td3.csv"]
dataframes = []

for file in data_files:
    df = pd.read_csv(file)
    dataframes.append(df)

data = pd.concat(dataframes, ignore_index=True)

# Preprocessing: text cleaning, tokenization, and label encoding
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = " ".join(word.lower() for word in str(text).split() if word.isalpha())
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

data['content'] = data['content'].apply(preprocess_text)

label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Train a text classification model
X = data['content']
y = data['label']

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

select_best = SelectKBest(chi2, k=2000)
X_new = select_best.fit_transform(X_tfidf, y)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_new, y)

# Save the trained model to a file
joblib.dump(model, "disaster_classification_model.pkl")

# Define a function to classify news
def classify_news(news_text):
    news_text = preprocess_text(news_text)
    news_tfidf = tfidf_vectorizer.transform([news_text])
    news_selected = select_best.transform(news_tfidf)
    prediction = model.predict(news_selected)
    return label_encoder.inverse_transform(prediction)[0]

# Console-based chatbot
print("Welcome to the Disaster News Classification Chatbot!")
while True:
    user_input = input("Enter a news statement (or 'exit' to quit): ")
    user_input=user_input.lower()
    if user_input.lower() == 'exit':
        break
    classification = classify_news(user_input)
    print(f"Chatbot Response: {classification}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Welcome to the Disaster News Classification Chatbot!
Enter a news statement (or 'exit' to quit): there was Tsunami of fans outside the cricket stadium
Chatbot Response: no
Enter a news statement (or 'exit' to quit): Odisha was hit by a Tsunami due to rise in water levels
Chatbot Response: yes
Enter a news statement (or 'exit' to quit): exit
