In [None]:
import os
import pandas as pd
import re
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
# Load the small English spaCy pipeline; it is used further down for named-entity recognition
nlp = spacy.load("en_core_web_sm")
# Download the VADER lexicon that SentimentIntensityAnalyzer depends on
nltk.download('vader_lexicon')
# Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Specify the file name
filename = 'TelegramData.csv'  # Replace with your actual file name

# Read the CSV file. The 'text' column is pinned to str so that values which look
# numeric are not inferred as int/float, which would make the regex calls below
# raise TypeError.
df = pd.read_csv(filename, dtype={'text': str})

# Fill NaN values in the 'text' column with an empty string
df['text'] = df['text'].fillna('')

# Compile the patterns once instead of looking them up for every row
hashtag_pattern = re.compile(r'#\w+')
url_pattern = re.compile(r'http\S+')

# For each message, collect every hashtag / URL into one space-separated string
# (an empty string when the message contains none)
df['new_hashtags'] = df['text'].apply(lambda x: ' '.join(hashtag_pattern.findall(x)))
df['urls'] = df['text'].apply(lambda x: ' '.join(url_pattern.findall(x)))

# Perform sentiment analysis on the 'text' column and save the results to new columns
def analyze_sentiment(text):
    """Score *text* with the module-level VADER analyzer ``sia``.

    Returns a 4-tuple ``(pos, neu, neg, compound)`` read from the dict that
    ``sia.polarity_scores(text)`` produces.
    """
    scores = sia.polarity_scores(text)
    return tuple(scores[key] for key in ('pos', 'neu', 'neg', 'compound'))

# Apply the analyze_sentiment function to the 'text' column.
# The scores are collected into a DataFrame with explicit columns and index rather
# than unpacked from zip(*...): unpacking raises ValueError when the frame has no
# rows, whereas this form simply yields four empty columns.
sentiment_columns = ['Positive', 'Neutral', 'Negative', 'Compound']
sentiment_frame = pd.DataFrame(
    df['text'].apply(analyze_sentiment).tolist(),
    columns=sentiment_columns,
    index=df.index,
)
df[sentiment_columns] = sentiment_frame

# Function to perform NER and separate entities by type
def extract_entities_by_type(text):
    """Run the module-level spaCy pipeline ``nlp`` on *text* and group entity texts by label.

    Returns a 5-tuple of lists, in this order: PERSON, ORG, GPE, NORP, DATE.
    Each list holds the matching entities' ``.text`` in document order; entities
    with any other label are ignored.
    """
    wanted_labels = ("PERSON", "ORG", "GPE", "NORP", "DATE")
    buckets = {label: [] for label in wanted_labels}
    # Single pass over the recognised entities, routing each to its label's bucket
    for ent in nlp(text).ents:
        bucket = buckets.get(ent.label_)
        if bucket is not None:
            bucket.append(ent.text)
    return tuple(buckets[label] for label in wanted_labels)

# Apply the function to each message and store the results in separate columns,
# one list-valued column per entity type.
# NOTE(review): 'religiious' in the column name below is a typo; it is kept as-is
# because the name is written to the CSV header and other readers may rely on it.
entity_columns = ['Persons', 'Organizations', 'Locations', 'Nationalities/religiious groups', 'Date']
# Build the frame from the returned tuples instead of wrapping every row in a
# pd.Series: this avoids per-row Series construction and still produces the five
# columns when the frame has no rows.
entity_frame = pd.DataFrame(
    df['text'].apply(extract_entities_by_type).tolist(),
    columns=entity_columns,
    index=df.index,
)
df[entity_columns] = entity_frame


# Save the processed dataframe to the same CSV file, overriding the old one
df.to_csv(filename, index=False)


[nltk_data] Downloading package vader_lexicon to C:\Users\Jonah
[nltk_data]     Dalton\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
