# EDA, data cleaning & data preprocessing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### Mount Google Drive

In [3]:
# Mount Google Drive at /content/drive so the dataset stored under MyDrive
# can be read by the cells below. `google.colab` is provided by the Colab
# runtime, so this cell is expected to run there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import the dataset

In [4]:
# Load the stories dataset from the mounted Drive folder.
df = pd.read_csv(
    '/content/drive/MyDrive/topic extraction/data/stories_cleaned_tokenized.csv'
)

### Inspect the dataset to get an overview of it

In [5]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,body,topic,cleaned
0,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...",hello welcome bbc news woman gave key evidence...
1,news now out of North Hollywood. A 14 yearold ...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],news north hollywood yearold girl found dead i...
2,homelessness his city's greatest failure. That...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '74e2...",homelessness city greatest failure message ton...
3,Minneapolis police officer Kim Potter guilty o...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...",minneapolis police officer kim potter guilty d...
4,Judy an update now to the wildfires that wiped...,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9a06...",judy update wildfire wiped entire neighborhood...


In [6]:
# Preview the last rows to check the file was read through to the end.
df.tail()

Unnamed: 0,body,topic,cleaned
5136,News. More local help will soon be on the way....,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9632...",news local help soon way group volunteer yonke...
5137,"with March 1, we start what is called Meteorol...",['9a06646a-e1df-4fca-888e-69658420556b'],march start called meteorological spring keep ...
5138,overseas. A massive Russian convoy is headed t...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],overseas massive russian convoy headed toward ...
5139,"And this morning, the National Hockey League s...","['9ff54ded-904b-4e0c-85ce-a3617f5cb913', 'b492...",morning national hockey league say suspending ...
5140,"thank you very much. Also this morning, the Un...","['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...",thank much also morning united state asking tw...


In [7]:
# Column dtypes and non-null counts; use this to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5141 entries, 0 to 5140
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   body     5141 non-null   object
 1   topic    5141 non-null   object
 2   cleaned  5138 non-null   object
dtypes: object(3)
memory usage: 120.6+ KB


In [8]:
# Summary statistics per column (count / unique / top / freq for these text columns).
df.describe()

Unnamed: 0,body,topic,cleaned
count,5141,5141,5138
unique,5141,178,5137
top,hello and welcome to BBC News a woman who gave...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],vigil today yearold alexis gabe oakley last se...
freq,1,882,2


### Modules used to tokenize text

In [None]:
! pip install -U gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
# Fetch the NLTK resources the preprocessing code relies on:
# the stopword lists (read via stopwords.words('english')) and
# WordNet (presumably required by WordNetLemmatizer — TODO confirm).
nltk.download('stopwords')
nltk.download('wordnet')


### Use LDA as an unsupervised learning method to predict the title

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import CoherenceModel
from sklearn.model_selection import ParameterGrid

def preprocess_text(text):
    """Normalize a document and return its lemmatized tokens.

    Steps, in order: lowercase, remove URLs, remove digits, remove ASCII
    punctuation, collapse whitespace, split on whitespace, drop English
    stopwords (and any token rejected by ``is_english_word``), lemmatize.

    Parameters
    ----------
    text : str
        The document to clean.

    Returns
    -------
    list of str
        The surviving tokens, lemmatized, in their original order.
    """
    # Lowercasing
    text = text.lower()

    # Removing URLs. The dot after "www" is escaped so only a literal "www."
    # prefix counts as a URL. Unescaped, it matched any character (including
    # a space), so "www next" also deleted the following word.
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Removing numbers
    text = re.sub(r'\d+', '', text)

    # Removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Collapsing tabs, newlines and repeated spaces into single spaces
    text = re.sub(r'\s+', ' ', text)

    # Tokenization
    tokens = text.split()

    # Removing stopwords and non-English words, then lemmatizing what is left
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and is_english_word(token)]

    return tokens

def is_english_word(word):
    """Report whether ``word`` should be kept as an English word.

    Stub: no language check is implemented yet, so every word is accepted.
    Swap the body for a real check (for example a language-detection
    library or a vocabulary lookup) when filtering is actually needed.
    """
    return True

def calculate_coherence(texts, num_topics, dictionary, passes=10, random_state=None):
    """Train an LDA model on ``texts`` and return its ``c_v`` coherence score.

    Parameters
    ----------
    texts : iterable of str
        Raw documents; each one is tokenized with ``preprocess_text``.
    num_topics : int
        Number of topics the LDA model is trained with.
    dictionary : gensim.corpora.Dictionary
        Token-to-id mapping used for the bag-of-words corpus, for the model's
        ``id2word`` and for the coherence computation.
    passes : int, optional
        Number of training passes over the corpus (default 10, the value that
        was previously hard-coded).
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``LdaModel``. Leave as ``None`` for the previous
        unseeded behaviour; pass an integer so repeated calls (for example
        across a grid search) give comparable, reproducible scores.

    Returns
    -------
    float
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    # Preprocess the texts
    tokenized_texts = [preprocess_text(text) for text in texts]

    # Create a corpus (bag of words representation)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

    # Create the LDA model
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    # Calculate the coherence score
    coherence_model = CoherenceModel(model=lda_model, texts=tokenized_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    return coherence_score

# Example usage with your dataset.
# Uses `df`, the frame loaded earlier in this notebook; the previous name
# `df_story` is not defined by any preceding cell and fails on a fresh kernel.
# Rows with a missing `cleaned` value are dropped because preprocess_text
# calls `.lower()` on each entry and would fail on NaN.
dataset = df['cleaned'].dropna().copy()

# Preprocess the texts and create a dictionary
tokenized_texts = [preprocess_text(text) for text in dataset]
dictionary = corpora.Dictionary(tokenized_texts)

# Define the parameter grid for num_topics.
# Each value appears once: a repeated entry only retrains the same
# configuration and lengthens the search.
param_grid = {'num_topics': [2, 3, 5, 8, 10]}

# Perform grid search, keeping the configuration with the highest coherence
best_score = -float('inf')
best_params = None
for params in ParameterGrid(param_grid):
    coherence_score = calculate_coherence(dataset, params['num_topics'], dictionary)
    if coherence_score > best_score:
        best_score = coherence_score
        best_params = params

# Print the best parameters and coherence score
print("Best Parameters:", best_params)
print("Best Coherence Score:", best_score)


Best Parameters: {'num_topics': 10}
Best Coherence Score: 0.5496618743107466


In [29]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models

def preprocess_text(text):
    """Normalize a raw document and return its lemmatized tokens.

    Steps, in order: lowercase, drop non-ASCII characters, remove URLs,
    remove @mentions and #hashtags, squeeze characters repeated three or
    more times, remove digits, remove ASCII punctuation, collapse whitespace,
    split on whitespace, drop English stopwords, lemmatize.

    Parameters
    ----------
    text : str
        The document to clean.

    Returns
    -------
    list of str
        The surviving tokens, lemmatized, in their original order.
    """
    # Lowercasing
    text = text.lower()

    # Drop every non-ASCII character (this removes characters, not whole words)
    text = re.sub(r'[^\x00-\x7f]', r'', text)

    # Removing URLs. The dot after "www" is escaped so only a literal "www."
    # prefix counts as a URL; unescaped it matched any character, including a
    # space, and could delete the word following a bare "www".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove mentions and hashtags. This has to happen before punctuation is
    # stripped: once '@' and '#' are gone these patterns can never match and
    # the handle/tag text would stay in the token stream.
    text = re.sub(r'@\S*', '', text)
    text = re.sub(r'#\S*', '', text)

    # Squeeze any character repeated three or more times down to one
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    # Removing numbers
    text = re.sub(r'\d+', '', text)

    # Removing punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Collapse tabs, newlines and repeated spaces into single spaces
    # (one pass is enough; it also covers gaps left by the removals above)
    text = re.sub(r'\s+', ' ', text)

    # Tokenization
    tokens = text.split()

    # Removing stopwords, then lemmatizing what is left
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return tokens

def extract_topics(texts, num_topics, passes=10, random_state=None, model_path='lda.model'):
    """Fit an LDA model on ``texts`` and return its topics as printable tuples.

    Parameters
    ----------
    texts : iterable of str
        Raw documents; each one is tokenized with ``preprocess_text``.
    num_topics : int
        Number of topics to fit and to return.
    passes : int, optional
        Number of training passes over the corpus (default 10, the value that
        was previously hard-coded).
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``LdaModel``. ``None`` keeps the previous unseeded
        behaviour; pass an integer to get the same topics on every run.
    model_path : str or None, optional
        Where the trained model is saved. Defaults to ``'lda.model'`` in the
        working directory, as before. Pass ``None`` to skip saving.

    Returns
    -------
    list
        The result of ``lda_model.print_topics(num_topics=num_topics)``.
    """
    # Preprocess the texts
    tokenized_texts = [preprocess_text(text) for text in texts]

    # Create a dictionary from the tokenized texts
    dictionary = corpora.Dictionary(tokenized_texts)

    # Filter out rare and common tokens
    dictionary.filter_extremes(no_below=5, no_above=0.5)

    # Create a corpus (bag of words representation)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

    # Create the LDA model
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    # Persist the model unless the caller opted out
    if model_path is not None:
        lda_model.save(model_path)

    # Extract the topics
    topics = lda_model.print_topics(num_topics=num_topics)

    return topics

# Example usage with your dataset
# Works on the raw `body` text (not the `cleaned` column); preprocess_text
# does the cleaning inside extract_topics.
dataset = df['body'].copy()


# Extract topics
# Also writes the trained model to 'lda.model' as a side effect of extract_topics.
topics = extract_topics(dataset, num_topics=10)

# Print the topics
# Each entry is one item from lda_model.print_topics(); see the output below.
for topic in topics:
    print(topic)


(0, '0.031*"police" + 0.024*"say" + 0.018*"year" + 0.015*"old" + 0.014*"man" + 0.010*"shooting" + 0.010*"happened" + 0.009*"two" + 0.009*"suspect" + 0.009*"shot"')
(1, '0.028*"officer" + 0.020*"year" + 0.013*"today" + 0.009*"rivera" + 0.009*"life" + 0.009*"day" + 0.008*"nypd" + 0.008*"city" + 0.008*"old" + 0.008*"new"')
(2, '0.029*"new" + 0.018*"state" + 0.014*"york" + 0.013*"city" + 0.012*"governor" + 0.012*"president" + 0.011*"mayor" + 0.011*"today" + 0.010*"say" + 0.007*"adam"')
(3, '0.017*"morning" + 0.016*"going" + 0.014*"right" + 0.013*"see" + 0.011*"day" + 0.010*"well" + 0.009*"get" + 0.009*"snow" + 0.009*"good" + 0.009*"area"')
(4, '0.023*"mask" + 0.021*"mandate" + 0.016*"county" + 0.014*"vaccine" + 0.012*"people" + 0.012*"vaccinated" + 0.011*"covid" + 0.011*"get" + 0.010*"say" + 0.010*"health"')
(5, '0.053*"ukraine" + 0.034*"russian" + 0.032*"russia" + 0.024*"president" + 0.022*"u" + 0.019*"ukrainian" + 0.018*"putin" + 0.013*"troop" + 0.012*"invasion" + 0.011*"morning"')
(6, '