In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import bigrams
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')  # Added to fix LookupError
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("tweets.csv")

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters and mentions
    text = re.sub(r'@\w+|#\w+|[^\w\s]', '', text)
    # Remove emojis (basic regex for common emoji ranges)
    text = re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)

# Apply preprocessing to the 'text' column
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Save preprocessed data
df.to_csv("tweets_cleaned.csv", index=False)

# Word frequency analysis
# Split data by target
disaster_tweets = df[df['target'] == 1]['cleaned_text']
non_disaster_tweets = df[df['target'] == 0]['cleaned_text']

# Function to get word frequencies
def get_word_frequencies(text_series):
    all_words = ' '.join(text_series).split()
    return Counter(all_words)

# Function to get bigram frequencies
def get_bigram_frequencies(text_series):
    all_bigrams = []
    for text in text_series:
        tokens = text.split()
        all_bigrams.extend(list(bigrams(tokens)))
    return Counter(all_bigrams)

# Get word and bigram frequencies
disaster_freq = get_word_frequencies(disaster_tweets)
non_disaster_freq = get_word_frequencies(non_disaster_tweets)
disaster_bigram_freq = get_bigram_frequencies(disaster_tweets)
non_disaster_bigram_freq = get_bigram_frequencies(non_disaster_tweets)

# Print top 10 words and bigrams
print("Top 10 words in disaster tweets:", disaster_freq.most_common(10))
print("Top 10 words in non-disaster tweets:", non_disaster_freq.most_common(10))
print("Top 10 bigrams in disaster tweets:", disaster_bigram_freq.most_common(10))
print("Top 10 bigrams in non-disaster tweets:", non_disaster_bigram_freq.most_common(10))

# Save frequencies to CSV
disaster_freq_df = pd.DataFrame(disaster_freq.items(), columns=['word', 'count'])
non_disaster_freq_df = pd.DataFrame(non_disaster_freq.items(), columns=['word', 'count'])
disaster_freq_df.to_csv("disaster_word_freq.csv", index=False)
non_disaster_freq_df.to_csv("non_disaster_word_freq.csv", index=False)

# Exploratory Data Analysis (EDA)
# Most frequent word in each category
print("Most frequent word in disaster tweets:", disaster_freq.most_common(1))
print("Most frequent word in non-disaster tweets:", non_disaster_freq.most_common(1))

# Generate word clouds
disaster_wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(disaster_freq)
non_disaster_wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(non_disaster_freq)

# Plot word clouds
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.imshow(disaster_wordcloud, interpolation='bilinear')
plt.title('Disaster Tweets Word Cloud')
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(non_disaster_wordcloud, interpolation='bilinear')
plt.title('Non-Disaster Tweets Word Cloud')
plt.axis('off')
plt.savefig("wordclouds.png")
plt.close()

# Plot histograms of top 10 words
disaster_top10 = dict(disaster_freq.most_common(10))
non_disaster_top10 = dict(non_disaster_freq.most_common(10))

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.bar(disaster_top10.keys(), disaster_top10.values())
plt.title('Top 10 Words in Disaster Tweets')
plt.xticks(rotation=45)
plt.subplot(1, 2, 2)
plt.bar(non_disaster_top10.keys(), non_disaster_top10.values())
plt.title('Top 10 Words in Non-Disaster Tweets')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("word_histograms.png")
plt.close()

# Find common words in both categories
common_words = set(disaster_freq.keys()).intersection(set(non_disaster_freq.keys()))
print("Common words in both categories:", common_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saram\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saram\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 10 words in disaster tweets: [('fire', 177), ('via', 121), ('suicide', 110), ('disaster', 109), ('california', 107), ('amp', 107), ('police', 106), ('people', 104), ('killed', 92), ('like', 92)]
Top 10 words in non-disaster tweets: [('like', 253), ('im', 243), ('amp', 193), ('new', 168), ('get', 163), ('dont', 141), ('one', 127), ('body', 111), ('via', 99), ('would', 97)]
Top 10 bigrams in disaster tweets: [(('suicide', 'bomber'), 59), (('northern', 'california'), 41), (('oil', 'spill'), 38), (('burning', 'buildings'), 35), (('suicide', 'bombing'), 34), (('california', 'wildfire'), 32), (('bomber', 'detonated'), 30), (('confirmed', 'mh'), 29), (('yr', 'old'), 29), (('homes', 'razed'), 29)]
Top 10 bigrams in non-disaster tweets: [(('cross', 'body'), 38), (('liked', 'video'), 34), (('gon', 'na'), 32), (('wan', 'na'), 30), (('body', 'bag'), 26), (('full', 'reû_'), 25), (('body', 'bagging'), 23), (('burning', 'buildings'), 23), (('full', 'read'), 22), (('looks', 'like'), 21)]
Most freq