In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from wordcloud import WordCloud
from collections import Counter
from nltk import bigrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Fetch the NLTK resources used later in the notebook: the punkt tokenizer
# models, the English stopword list and (below) the VADER sentiment lexicon.
nltk.download('punkt')
nltk.download('stopwords')
# spaCy's small English pipeline; it supplies the default stopwords and the lemmatizer.
nlp = spacy.load("en_core_web_sm")
nltk.download('vader_lexicon')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
# Read the raw comment dataset from the notebook's working directory.
dataset_file = 'Israel_Palestine_Public_Opinion_Dataset.csv'
df = pd.read_csv(dataset_file)

# Table of Contents
1. [Reducing Dataset Size](#Reducing-Dataset-Size)
2. [Data Cleaning](#Data-Cleaning)
   1. [Dropping Irrelevant Features](#Dropping-irrelevant-features)
   2. [Dealing with Missing Values](#Dealing-with-missing-values)
   3. [Dealing with Duplicate Values](#Dealing-with-duplicate-values)
   4. [Understanding the Data](#Understanding-the-data)
3. [Data Preparation](#Data-Preparation)
   1. [Making all Text Lowercase](#Making-all-text-Lowercase)
   2. [Removing Irrelevant Characters](#Removing-Irrelevant-Characters)
   3. [Tokenising](#Tokenising)
   4. [Removing Stopwords](#Removing-Stopwords)
   5. [Lemmatization](#Lemmatization)
   6. [Joining & Finishing Up](#Joining-&-Finishing-Up)
4. [EDA](#EDA)
   1. [Most Common Words](#Most-Common-Words)
   2. [Word Cloud](#Word-Cloud)
   3. [Comment Length](#Comment-Length)
       1. [Removing Long Comments](#Removing-Long-Comments)
5. [Sentiment Analysis (Lexicon-Based)](#Sentiment-Analysis-(lexicon-based))
6. [Feature Extraction](#Feature-Extraction)
7. [Sentiment Prediction](#Sentiment-Prediction)
8. [Topic Modeling: LDA](#Topic-Modeling:-LDA)
9. [Supervised Model](#Supervised-Model)
10. [Downloading Dataset for Dashboard](#Downloading-Dataset-for-Dashboard)

# Reducing Dataset Size
Processing all ~75,000 comments is too computationally expensive, so the analysis works on a random 25% sample.

In [5]:
df.shape

(75543, 6)

In [6]:
# Keep a 25% random subset of the comments. random_state pins the draw so a
# Restart & Run All selects the same rows and reproduces the downstream results.
df = df.sample(frac=0.25, random_state=42)

# Data Cleaning

In [7]:
df.shape

(18886, 6)

In [8]:
df.head(10)

Unnamed: 0,comment_id,score,self_text,subreddit,created_time,created_date
59569,k85pcsy,236,"It’s not that they forgot, it’s that they don’...",worldnews,2023-11-07 02:02:41+00:00,2023-11-07
14965,k4vm5nc,1,Lmao they are downvoting you \n\nWhat a sub,AskMiddleEast,2023-10-14 18:53:27+00:00,2023-10-14
73868,k9g6mku,18,That's when the heavy hand of the IDF comes in...,worldnews,2023-11-16 02:38:30+00:00,2023-11-16
40909,k6skwta,2,"I mean, is there one?\nIts all interconnected ...",Palestine,2023-10-28 08:09:45+00:00,2023-10-28
1096,k3uafz0,3,If this is your response to civilians getting ...,CombatFootage,2023-10-07 11:45:45+00:00,2023-10-07
46101,k79l2ag,2,They’re killing a population made up of mostly...,PublicFreakout,2023-10-31 18:54:34+00:00,2023-10-31
22577,k5e24k4,5,Here is a sound comparison between the differe...,Palestine,2023-10-18 11:51:54+00:00,2023-10-18
73251,k9jgpw0,-3,To kill civilians.\n\nIt's always been the pla...,IsrealPalestineWar_23,2023-11-16 19:19:08+00:00,2023-11-16
21050,k5706ih,1,Diaspora Palestinian here. I have visited my ...,IsraelPalestine,2023-10-17 00:38:38+00:00,2023-10-17
74522,k9m2lhp,9,lol clearly failed at school. What clowns. Can...,Palestine,2023-11-17 07:46:08+00:00,2023-11-17


In [9]:
df.tail() 

Unnamed: 0,comment_id,score,self_text,subreddit,created_time,created_date
236,k3w6ooy,5,Just rid the nation of hamas like they did wit...,PublicFreakout,2023-10-07 19:39:37+00:00,2023-10-07
36692,k6fgkac,5,He is not Turkish just a wannabe.,AskMiddleEast,2023-10-25 18:31:43+00:00,2023-10-25
20566,k5bfi4v,7,Peak noncredibility,NonCredibleDefense,2023-10-17 21:36:15+00:00,2023-10-17
593,k3tx7ln,58,1. jewish holiday\n2. Yom-Kippur 50 year anive...,Palestine,2023-10-07 09:00:57+00:00,2023-10-07
9628,k4lh56u,1,They have confirmed in the last 2 hours. That ...,IsraelPalestine,2023-10-12 18:25:14+00:00,2023-10-12


## Dropping irrelevant features

In [10]:
# Timestamps and subreddit are not used in the text analysis below.
unused_columns = ['created_time', 'created_date', 'subreddit']
df = df.drop(unused_columns, axis='columns')

## Dealing with missing values

In [11]:
df.isnull().sum()

comment_id    0
score         0
self_text     0
dtype: int64

In [12]:
# Discard rows with no comment text (the null counts above show none in this sample).
df = df.dropna(axis='index', subset=['self_text'])

In [13]:
df.isnull().sum()

comment_id    0
score         0
self_text     0
dtype: int64

## Dealing with duplicate values

In [14]:
df.duplicated().sum()

0

In [15]:
df.shape

(18886, 3)

## Understanding the data

In [16]:
df.dtypes

comment_id    object
score          int64
self_text     object
dtype: object

In [17]:
df.describe()

Unnamed: 0,score
count,18886.0
mean,23.610717
std,147.22688
min,-781.0
25%,1.0
50%,2.0
75%,8.0
max,5531.0


# Data Preparation

## Making all text Lowercase

In [18]:
# Normalise case so later matching (stopwords, counts) is case-insensitive.
df = df.assign(self_text=df['self_text'].str.lower())

## Removing Irrelevant Characters

In [19]:
# Applied in order: URLs and digit runs become a single space, then every
# character that is neither a word character nor whitespace is deleted.
cleaning_steps = [
    (r'http\S+|www\S+|https\S+', ' '),
    (r"\d+", " "),
    (r'[^\w\s]', ''),
]
for pattern, replacement in cleaning_steps:
    df['self_text'] = df['self_text'].str.replace(pattern, replacement, regex=True)
# (Jerry, 2014)(DataScientYst - Data Science Simplified, 2021)(Python, 2009)

## Tokenising

In [20]:
# Split each cleaned comment into a list of word tokens with NLTK.
df['tokens'] = df['self_text'].map(word_tokenize)

## Removing Stopwords

In [21]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
# Three stopword sources: spaCy's defaults, NLTK's English list, and a few
# extras added by hand for this corpus.
spacy_stopwords = nlp.Defaults.stop_words
nltk_stopwords = set(stopwords.words('english'))
additional_stopwords = set(['do', 'like', 's', 'm', 're', 'l', 'i', 'I', 'they', 'now'])

In [23]:
# Merge the three sources into one lower-cased lookup set.
combined_stopwords = {
    word.lower()
    for word in set().union(nltk_stopwords, spacy_stopwords, additional_stopwords)
}

In [24]:
def _keep_token(word):
    """Return True when ``word`` is not a stopword and is not a stray single character."""
    lowered = word.lower()
    if lowered in combined_stopwords:
        return False
    return len(word) > 1 or lowered in ('a', 'i')


# Lower-case every surviving token.
df['filtered_tokens'] = df['tokens'].apply(
    lambda tokens: [word.lower() for word in tokens if _keep_token(word)]
)

## Lemmatization
(ame, 2018)

In [None]:
def lemmatize_text(tokens):
    """Return the spaCy lemma of every token in ``tokens``.

    The tokens are joined with single spaces and passed through the ``nlp``
    pipeline loaded at the top of the notebook. spaCy tokenises that string
    again, so the output may not align one-to-one with the input list.

    Parameters
    ----------
    tokens : list of str
        Stopword-filtered tokens of one comment.

    Returns
    -------
    list of str
        Lemmas in document order.
    """
    doc = nlp(' '.join(tokens))
    return [token.lemma_ for token in doc]


# Lemmatise the whole column in batches. nlp.pipe streams the texts through the
# pipeline instead of making one full nlp() call per row, and the parser and
# NER components are skipped because only token.lemma_ is read here.
# The pipeline itself is the one loaded once in the setup cell; it is not reloaded.
joined_comments = (' '.join(tokens) for tokens in df['filtered_tokens'])
lemma_lists = [
    [token.lemma_ for token in doc]
    for doc in nlp.pipe(joined_comments, batch_size=256, disable=['parser', 'ner'])
]
# Wrapping in a Series keyed by df.index keeps each lemma list on its own row.
df['lemmatized'] = pd.Series(lemma_lists, index=df.index)

## Joining & Finishing Up

In [None]:
# Collapse each lemma list back into one space-separated string.
df['text_wx'] = df['lemmatized'].map(' '.join)

In [None]:
# Putting all the comments into one string.
text = df['text_wx'].astype(str).str.cat(sep=' ')

In [None]:
# df = df.drop(columns=['tokens', 'filtered_tokens', 'lemmatized'])
# Keep only the identifier, the score and the cleaned text; .copy() detaches the result.
kept_columns = ['comment_id', 'score', 'text_wx']
df = df.loc[:, kept_columns].copy()

# EDA

In [None]:
df.head()

## Most Common Words

In [None]:
# Frequency of every whitespace-separated token across all cleaned comments.
top_n = 30
all_words = [word for text in df['text_wx'] for word in text.split()]
word_counts = Counter(all_words)

most_common_words = word_counts.most_common(top_n)
words, counts = zip(*most_common_words)

plt.figure(figsize=(20, 5))
plt.bar(words, counts)
plt.xlabel('Words')
plt.ylabel('Frequency')
# Title reports the number of bars actually drawn (previously hard-coded as "Top 10").
plt.title(f'Top {len(words)} Most Common Words')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Same frequency chart as the previous cell, widened to the 40 most common tokens.
top_n = 40
all_words = [word for text in df['text_wx'] for word in text.split()]
word_counts = Counter(all_words)

most_common_words = word_counts.most_common(top_n)
words, counts = zip(*most_common_words)

plt.figure(figsize=(50, 10))
plt.bar(words, counts)
plt.xlabel('Words')
plt.ylabel('Frequency')
# Title reports the number of bars actually drawn (previously hard-coded as "Top 10").
plt.title(f'Top {len(words)} Most Common Words')
plt.xticks(rotation=45)
plt.show()

## Word Cloud

In [None]:
# Build the word cloud from the single corpus string assembled earlier.
cloud_options = {
    'width': 800,
    'height': 800,
    'background_color': 'black',
    'min_font_size': 10,
}
wordcloud = WordCloud(**cloud_options).generate(text)

fig, ax = plt.subplots(figsize=(10, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()

(DataCamp, n.d.)

## Comment Length

In [None]:
# Length of each cleaned comment in characters (not words).
df['comment_length'] = df['text_wx'].str.len()

fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(df['comment_length'], vert=False)
ax.set_title('Box Plot of Comment Lengths')
ax.set_xlabel('Length of Comment')
plt.show()

### Removing Long Comments

In [None]:
# Remove the very long outliers visible in the box plot above. The explicit
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the old one (SettingWithCopyWarning).
MAX_COMMENT_LENGTH = 1800  # characters of cleaned text
df = df[df['comment_length'] <= MAX_COMMENT_LENGTH].copy()

In [None]:
# Re-draw the length distribution after dropping the long comments.
fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(df['comment_length'], vert=False)
ax.set_title('Box Plot of Comment Lengths after Trimming')
ax.set_xlabel('Length of Comment')
plt.show()

# Sentiment Analysis (lexicon-based)

In [None]:
df.sample(5)

In [None]:
SIA = SentimentIntensityAnalyzer()


def _label_from_compound(compound):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'."""
    if compound > 0.05:
        return 'positive'
    if compound < -0.05:
        return 'negative'
    return 'neutral'


# NOTE(review): VADER scores the lemmatised, stopword-free text here rather
# than the raw comments — confirm this is intended.
df['sentiment'] = df['text_wx'].apply(SIA.polarity_scores)

# Keep the compound score and derive a three-way label from it.
df['sentiment_score'] = df['sentiment'].map(lambda scores: scores['compound'])
df['sentiment_label'] = df['sentiment_score'].map(_label_from_compound)

In [None]:
# Number of comments per sentiment label.
sentiment_counts = df['sentiment_label'].value_counts()

fig, ax = plt.subplots()
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='coolwarm', ax=ax)
ax.set_title('Sentiment Label Distribution')
ax.set_ylabel('Number of Comments')
ax.set_xlabel('Sentiment Label')
plt.show()

In [None]:
# Distribution of the compound sentiment scores in 20 bins.
fig, ax = plt.subplots()
ax.hist(df['sentiment_score'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Histogram of Sentiment Scores')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Number of Comments')
plt.show()

#  Feature Extraction

In [None]:
df.head()

In [None]:
# TF-IDF matrix of the cleaned comments, built with min_df=2 and max_df=0.95.
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
tfidf_features = tfidf_vectorizer.fit_transform(df['text_wx'])

# Sentiment Prediction

In [None]:
# Target for the sentiment classifier: the lexicon-derived labels computed above.
# (Named y_sentiment rather than y_pred because these are the labels to learn,
# not model predictions.)
y_sentiment = df['sentiment_label']

# Stratify so the train and test splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_features,
    y_sentiment,
    test_size=0.2,
    random_state=42,
    stratify=y_sentiment,
)
# Text Analytics - Bag of Words Feature Extraction

In [None]:
# Fit multinomial Naive Bayes on the training split, then label the held-out split.
model_tfidf = MultinomialNB().fit(X_train, y_train)
sentiment_pred = model_tfidf.predict(X_test)


In [None]:
# Ask for the report as a dict so only the summary rows are printed.
report_tfidf = classification_report(y_test, sentiment_pred, output_dict=True)

accuracy, macro_avg, weighted_avg = (
    report_tfidf[key] for key in ('accuracy', 'macro avg', 'weighted avg')
)

print(f"Accuracy: {accuracy:.2f}")
print(f"Macro Avg Precision: {macro_avg['precision']:.2f}, Recall: {macro_avg['recall']:.2f}, F1-score: {macro_avg['f1-score']:.2f}")
print(f"Weighted Avg Precision: {weighted_avg['precision']:.2f}, Recall: {weighted_avg['recall']:.2f}, F1-score: {weighted_avg['f1-score']:.2f}")

    # print(report_tfidf) couldn't print report as the output was too large

# Topic Modeling: LDA

In [None]:
# Raw term counts for LDA, with scikit-learn's English stopword list applied.
count_vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.95)
dtm = count_vectorizer.fit_transform(df['text_wx'])

In [None]:
# Five-topic LDA with a fixed seed, fitted on the document-term matrix.
lda_params = {'n_components': 5, 'random_state': 42}
lda_model = LatentDirichletAllocation(**lda_params)
lda_model.fit(dtm)

In [None]:
tf_feature_names = count_vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(lda_model.components_):
    # argsort is ascending, so reverse it and keep the 30 heaviest term indices
    # (most important word first).
    heaviest_first = topic.argsort()[::-1][:30]
    top_words = [tf_feature_names[i] for i in heaviest_first]
    print(f"Topic {topic_idx}:")
    print(" ".join(top_words))

In [None]:
# Topic weights per comment; each comment is assigned its highest-weight topic.
topic_results = lda_model.transform(dtm)
df['topic'] = np.argmax(topic_results, axis=1)

In [None]:
# NOTE(review): these labels appear to be hand-assigned from the top words
# printed above — re-check them whenever the input sample or LDA settings change.
topic_names = {
    0: 'Ethics & Beliefs',
    1: 'Military',
    2: 'History',
    3: 'Unclear Web Related',
    4: 'Concern for Innocent',
}

df['topic_label'] = df['topic'].map(topic_names)

In [None]:
df.head()

In [None]:
# Number of comments per topic label; reused by the bar chart in the next cell.
topic_label_counts = df['topic_label'].value_counts()
print(topic_label_counts)

In [None]:
# Bar chart of the comment count per topic label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=topic_label_counts.index,
    y=topic_label_counts.values,
    palette="viridis",
    ax=ax,
)
ax.set_title('Comments by Topics')
ax.set_xlabel('Topic Labels')
ax.set_ylabel('Number of Comments')
fig.tight_layout()
plt.show()

# Supervised Model

In [None]:
# Re-fit TF-IDF on the current df with the same settings as the sentiment section.
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
tfidf_features = tfidf_vectorizer.fit_transform(df['text_wx'])

In [None]:
# Features: the TF-IDF matrix. Target: the LDA-derived topic label of each comment.
X, y = tfidf_features, df['topic_label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
# Random forest with a fixed seed, trained to predict the topic label from TF-IDF features.
forest_params = {'random_state': 42}
classifier = RandomForestClassifier(**forest_params)
classifier.fit(X_train, y_train)

In [None]:
# Score the held-out split and print per-class precision, recall and F1.
y_pred = classifier.predict(X_test)
topic_report = classification_report(y_test, y_pred)
print(topic_report)

# Downloading Dataset for Dashboard

In [None]:
df.shape

In [None]:
df.sample(20)

In [None]:
# Saving for use in Dashboard notebook.
# Written relative to the working directory (as the input CSV is read at the
# top of the notebook) instead of a user-specific absolute path, so the
# notebook runs on any machine.
# NOTE(review): assumes the Dashboard notebook reads from this same project folder — confirm.
df.to_csv('text_analytics_dataset.csv', index=False)