In [1]:
# handling file data
import pandas as pd
# handling numerical data
import numpy as np
# for ploting/visualisation
import matplotlib.pyplot as plt
import seaborn as sns 

# importing Natural Language Toolkit
import nltk

# data pre-processing
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# transform data
from sklearn.feature_extraction.text import TfidfVectorizer
# split into train and test
from sklearn.model_selection import train_test_split
# model building
from sklearn.linear_model import LogisticRegression
# 
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
train = pd.read_csv("/Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/dataset/train.csv")
test = pd.read_csv("/Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/dataset/test.csv")
submission = pd.read_csv("//Users/ankitbaliyan/Documents/VS_Code/Ongoing projects/NLP_Disaster/dataset/sample_sub.csv")

train_df=train[['text','target']]
test_df=test[['text']]

In [3]:
train_df.shape

(7613, 2)

In [4]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# train dataset 

In [5]:

def preprocess_and_train(df):
    # Tokenization
    #nltk.download('punkt')
    df['text'] = df['text'].apply(nltk.word_tokenize)
    
    # Text Lowercasing
    df['text'] = df['text'].apply(lambda x: [word.lower() for word in x])
    
    # Stop Word Removal
    #nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])
    
    # Lemmatization
    #nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    
    # Removing Special Characters and Punctuation
    df['text'] = df['text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]", "", word) for word in x])
    
    # Handling Numerical Values and # Words
    df['text'] = df['text'].apply(lambda x: [re.sub(r"\d+", "NUM", word) for word in x])
    df['text'] = df['text'].apply(lambda x: [re.sub(r"#\w+", "", word) for word in x])
    
    # Vectorization
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(df['text'].apply(lambda x: " ".join(x)))
    y = df['target']
    
    # Split the dataset into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Build the logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_pred = model.predict(X_val)
    
    # Evaluate the model
    classification_repor = classification_report(y_val, y_pred)
    accuracy = model.score(X_val, y_val)
    
    print("Classification Report:")
    print(classification_repor)
    print("Accuracy:", accuracy)

    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))


In [6]:
preprocess_and_train(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(nltk.word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: [word.lower() for word in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.91      0.83       874
           1       0.83      0.64      0.72       649

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.78      1523
weighted avg       0.80      0.79      0.78      1523

Accuracy: 0.7905449770190414
Confusion Matrix:
[[791  83]
 [236 413]]


# test dataset

In [7]:

# data preprocessing
def preprocess(train_df, test_df):
    # Concatenate train and test data for consistent preprocessing
    combined_df = pd.concat([train_df, test_df], axis=0)

    # Tokenization
    
    combined_df['text'] = combined_df['text'].apply(nltk.word_tokenize)

    # Text Lowercasing
    combined_df['text'] = combined_df['text'].apply(lambda x: [word.lower() for word in x])

    # Stop Word Removal
    
    stop_words = set(stopwords.words('english'))
    combined_df['text'] = combined_df['text'].apply(lambda x: [word for word in x if word not in stop_words])

    # Lemmatization
    
    lemmatizer = WordNetLemmatizer()
    combined_df['text'] = combined_df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

    # Removing Special Characters and Punctuation
    combined_df['text'] = combined_df['text'].apply(lambda x: [re.sub(r"[^a-zA-Z0-9]", "", word) for word in x])

    # Handling Numerical Values and # Words
    combined_df['text'] = combined_df['text'].apply(lambda x: [re.sub(r"\d+", "NUM", word) for word in x])
    combined_df['text'] = combined_df['text'].apply(lambda x: [re.sub(r"#\w+", "", word) for word in x])

    # Filter out documents that are empty or contain only stop words
    #combined_df['text'] = combined_df['text'].apply(lambda x: [word for word in x if word.strip() != ''])
    #combined_df = combined_df[combined_df['text'].apply(lambda x: len(x) > 0)]
    return combined_df


# making predictions
def predict(train_df, test_df):
    # Vectorization
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_df['text'].apply(lambda x: " ".join(x)))
    X_test = vectorizer.transform(test_df['text'].apply(lambda x: " ".join(x)))
    y_train = train_df['target']

    # Check if vocabulary is empty
    if not vectorizer.vocabulary_:
        raise ValueError("Empty vocabulary. Check the preprocessing steps, as all documents may contain only stop words.")

    # Build the logistic regression model
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    return y_pred

# accuracy
def check_accuracy(y_val, y_pred):
    # Evaluate the model
    classification_repor = classification_report(y_val, y_pred)
    
    print("Classification Report:")
    print(classification_repor)


In [8]:
# test data preprocessing
combined_df = preprocess(train,test)
# separating train and test data
i= train.shape[0]
train = combined_df.iloc[:i]
test = combined_df.iloc[i:]


In [9]:
submission['target'] = predict(train, test)

In [16]:
train['target'].value_counts()

0.0    4342
1.0    3271
Name: target, dtype: int64

In [17]:
submission['target'].value_counts()

0.0    2058
1.0    1205
Name: target, dtype: int64

In [12]:
submission.head()

Unnamed: 0,id,target
0,0,1.0
1,2,1.0
2,3,1.0
3,9,1.0
4,11,1.0


In [13]:
#submission.to_csv("/Users/ankitbaliyan/Documents/VS_Code/NLP_Disaster/subission_file.csv", index=False)