# Sentiment Analysis

In [2]:
import pandas as pd
from textblob import TextBlob

# Load the IMDB reviews CSV (path is relative to the working directory).
# Later statements in this cell read its `review` and `sentiment` columns.
df = pd.read_csv("IMDB_Dataset.csv")

# Sentiment analysis using TextBlob
def sentiment_textblob(text):
    """Classify `text` by the sign of its TextBlob polarity score.

    Returns 'positive' when the polarity is above zero, 'neutral' when it is
    exactly zero, and 'negative' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

# Label every review with the TextBlob-based classifier
df['predicted_sentiment'] = df['review'].map(sentiment_textblob)

# Ground-truth column: 'positive' stays 'positive', any other value becomes 'negative'
df['true_sentiment'] = df['sentiment'].eq('positive').map({True: 'positive', False: 'negative'})

# Show the first 10 rows of the dataframe
print(df.head(10))


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   
5  Probably my all-time favorite movie, a story o...  positive   
6  I sure would like to see a resurrection of a u...  positive   
7  This show was an amazing, fresh & innovative i...  negative   
8  Encouraged by the positive comments about this...  negative   
9  If you like original gut wrenching laughter yo...  positive   

  predicted_sentiment true_sentiment  
0            positive       positive  
1            positive       positive  
2            positive       positive  
3            negative       negative  
4            positive       positive  
5            positive       positive  


In [17]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.0.1-cp38-cp38-win_amd64.whl (10.8 MB)
     ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/10.8 MB 1.7 MB/s eta 0:00:07
     ---------------------------------------- 0.1/10.8 MB 1.7 MB/s eta 0:00:07
      --------------------------------------- 0.2/10.8 MB 1.4 MB/s eta 0:00:08
      --------------------------------------- 0.2/10.8 MB 1.3 MB/s eta 0:00:09
     - -------------------------------------- 0.3/10.8 MB 1.2 MB/s eta 0:00:09
     - -------------------------------------- 0.4/10.8 MB 1.3 MB/s eta 0:00:08
     -- ------------------------------------- 0.6/10.8 MB 1.7 MB/s eta 0:00:06
     --- ------------------------------------ 0.8/10.8 MB 2.1 MB/s eta 0:00:05
     --- ------------------------------------ 1.0/10.8 MB 2.4 MB/s eta 0:00:05
     --- ------------------------------------ 1.0/10.8 MB 2.4 MB/s eta 0:00:05
     ---- ----------------------------------- 1.2/10.8 MB 2

In [2]:
pip install gensim




In [2]:
pip install --upgrade gensim

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     - -------------------------------------- 0.0/1.5 MB 991.0 kB/s eta 0:00:02
     --- ------------------------------------ 0.1/1.5 MB 1.2 MB/s eta 0:00:02
     ------ --------------------------------- 0.2/1.5 MB 1.9 MB/s eta 0:00:01
     ---------- ----------------------------- 0.4/1.5 MB 2.0 MB/s eta 0:00:01
     --------------- ------------------------ 0.6/1.5 MB 2.5 MB/s eta 0:00:01
     ------------------- -------------------- 0.7/1.5 MB 2.6 MB/s eta 0:00:01
     --------------------- ------------------ 0.8/1.5 MB 2.4 MB/s eta 0:00:01
     --------------------------- ------------ 1.1/1.5 MB 2.6 MB/s eta 0:00:01
     ------------------------------- -------- 1.2/1.5 MB 2.7 MB/s eta 0:00:01
     ------------------------------------- -- 1.4/1.5 MB 2.7 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 2.7 MB/s eta 0:00:00
Co

In [2]:
import nltk
# Fetch the NLTK 'stopwords' corpus; the topic-modelling cell further down
# reads it via stopwords.words('english').
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

# Topic Modelling

In [2]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords

# Load the topic-modelling CSV (path is relative to the working directory).
# NOTE: this rebinds `df`, replacing the IMDB frame loaded in the
# sentiment-analysis section.
df = pd.read_csv("topicmodelling.csv")
# Preview the first rows to check the columns.
print(df.head())

   ID                                              TITLE   
0   1        Reconstructing Subject-Specific Effect Maps  \
1   2                 Rotation Invariance Neural Network   
2   3  Spherical polyharmonics and Poisson kernels fo...   
3   4  A finite element approximation for the stochas...   
4   5  Comparative study of Discrete Wavelet Transfor...   

                                            ABSTRACT  Computer Science   
0    Predictive models allow subject-specific inf...                 1  \
1    Rotation invariance and translation invarian...                 1   
2    We introduce and develop the notion of spher...                 0   
3    The stochastic Landau--Lifshitz--Gilbert (LL...                 0   
4    Fourier-transform infra-red (FTIR) spectra o...                 1   

   Physics  Mathematics  Statistics  Quantitative Biology  
0        0            0           0                     0  
1        0            0           0                     0  
2        0    

In [4]:
# Tokenize and preprocess the titles.
# NOTE(review): only the TITLE column is tokenized here; the ABSTRACT column is
# not used — confirm that this is the intended input for topic modelling.
stop_words = stopwords.words('english')
# Frozen-set copy for constant-time membership tests; `stop_words` stays a list.
_stop_word_set = frozenset(stop_words)
def preprocess(text):
    """Tokenize `text` with gensim's simple_preprocess and drop English stop words.

    Args:
        text: raw document string.

    Returns:
        List of the tokens produced by simple_preprocess that are not in the
        NLTK English stop-word list, in their original order.
    """
    return [token for token in simple_preprocess(text) if token not in _stop_word_set]
df['tokens'] = df['TITLE'].apply(preprocess)

# Build the token dictionary, then encode every tokenized document with doc2bow
dictionary = gensim.corpora.Dictionary(df['tokens'])
corpus = list(map(dictionary.doc2bow, df['tokens']))

# Find the optimal number of topics using coherence score
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """Train one LDA model per candidate topic count and score it with c_v coherence.

    Args:
        dictionary: gensim dictionary, passed to LdaModel as `id2word` and to
            CoherenceModel as `dictionary`.
        corpus: corpus the LDA models are trained on.
        texts: tokenized documents handed to CoherenceModel.
        limit: exclusive upper bound of the topic counts tried.
        start: first topic count tried.
        step: increment between successive topic counts.
        random_state: seed forwarded to LdaModel so runs can be repeated; the
            default None leaves training unseeded, as before.

    Returns:
        List of (num_topics, coherence_score) tuples in the order tried.
    """
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append((num_topics, coherence_model.get_coherence()))
    return coherence_values

# Score every topic count from 2 up to (but not including) 20
coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=df['tokens'],
    start=2,
    limit=20,
    step=1,
)
print(coherence_values)

# Train the LDA model
# NOTE(review): the topic count is hard-coded and is not derived from the
# coherence scores computed above — confirm 5 is the intended choice.
num_topics = 5
# Seed the model so the learned topics do not change from run to run.
model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)

# Print the topics and the top words for each topic (topic ids shown 1-based)
for idx, topic in model.show_topics(num_topics=num_topics, formatted=False):
    print('Topic: {} \nWords: {}'.format(idx+1, [word for word, _ in topic]))

# Label each document with the 1-based id of its highest-scoring topic
dominant_topics = []
for bow in corpus:
    ranked = sorted(model[bow], key=lambda pair: pair[1], reverse=True)
    dominant_topics.append(ranked[0][0] + 1)
df['topic'] = dominant_topics
print(df[['TITLE', 'topic']])

[(2, 0.31086878819878755), (3, 0.2609846481106636), (4, 0.3564830756183952), (5, 0.3951638349062978), (6, 0.487800669676959), (7, 0.45098883564552966), (8, 0.4726759909462879), (9, 0.4901286290248069), (10, 0.5138595384882116), (11, 0.48563886638181536), (12, 0.5303147873623111), (13, 0.5363552514068115), (14, 0.5461062959823407), (15, 0.5386613053494289), (16, 0.535654178687943), (17, 0.47736533251683233), (18, 0.5333581024799596), (19, 0.37355235500160694)]
Topic: 1 
Words: ['poisson', 'kernels', 'spherical', 'functions', 'polyharmonic', 'polyharmonics', 'neural', 'specific', 'rotation', 'frequency']
Topic: 2 
Words: ['system', 'gilbert', 'finite', 'approximation', 'element', 'lifshitz', 'maxwell', 'landau', 'stochastic', 'fundamental']
Topic: 3 
Words: ['wavelet', 'plants', 'train', 'ftir', 'medicinal', 'tensor', 'decomposition', 'study', 'data', 'feature']
Topic: 4 
Words: ['effects', 'collisions', 'mars', 'material', 'sph', 'state', 'scale', 'role', 'calculations', 'numerical']
To

# Text Generation

In [11]:
import nltk
import numpy as np
import random
import string

# Load the dataset; the context manager closes the file even if read() raises.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file's encoding and pass it explicitly if it differs.
with open('game_of_thrones.txt') as f:
    raw_text = f.read()

# Preprocessing: lowercase the text, then tokenize it
raw_text = raw_text.lower()
sentences = nltk.sent_tokenize(raw_text)  # sentence-level tokens
word_tokens = nltk.word_tokenize(raw_text)  # word-level tokens
# Strip every punctuation character from each token
table = str.maketrans('', '', string.punctuation)
stripped = list(map(lambda token: token.translate(table), word_tokens))
# Keep only the tokens that are purely alphabetic (drops empty strings too)
words = list(filter(str.isalpha, stripped))

# Build the Markov model
def build_model(words, order=2):
    """Build a Markov-chain lookup table from a sequence of words.

    Each run of `order` consecutive words, joined with single spaces, becomes
    a key. Its value is the list of words that followed that run, in order of
    appearance and with repeats kept.
    """
    transitions = {}
    # `end` is the index of the follower word; the key is the `order` words before it.
    for end in range(order, len(words)):
        key = ' '.join(words[end - order:end])
        transitions.setdefault(key, []).append(words[end])
    return transitions

# Generate a sentence
def generate_sentence(model, order=2, length=15):
    """Generate a space-separated word sequence by walking a Markov model.

    Args:
        model: mapping of space-joined n-gram -> list of following words, in
            the shape returned by build_model.
        order: number of trailing words joined to form the next lookup key;
            it should match the order the model was built with.
        length: maximum number of words appended after the starting n-gram.

    Returns:
        The starting n-gram followed by up to `length` generated words. The
        walk stops early when the current n-gram has no followers in `model`.

    Raises:
        ValueError: if `model` is empty.
    """
    if not model:
        raise ValueError('cannot generate a sentence from an empty model')
    # Start from one of the model's own n-grams instead of a module-level word
    # list, so the function depends only on its arguments. Weighting by the
    # follower count favours n-grams with more recorded followers; for models
    # from build_model that count is how often the n-gram occurred.
    grams = list(model)
    weights = [len(model[gram]) for gram in grams]
    current_gram = random.choices(grams, weights=weights)[0]
    tokens = current_gram.split()
    for _ in range(length):
        followers = model.get(current_gram)
        if not followers:
            break
        tokens.append(random.choice(followers))
        # The last `order` words form the key for the next step.
        current_gram = ' '.join(tokens[-order:])
    return ' '.join(tokens)

# Generate a paragraph of 10 sentences
# Build the Markov model once: it is the same for every sentence, so rebuilding
# it inside the loop only repeats the same pass over the whole word list.
markov_model = build_model(words)
paragraph = ''
for i in range(10):
    sentence = generate_sentence(markov_model, length=15)
    paragraph += sentence.capitalize() + '. '
print(paragraph)


Hurry will rose ser waymar royce was the youngest son of an ancient gnarled ironwood and dismounted. Will said fear had made him insolent perhaps my lord would care to take orders from a. Long enough to kill eight grown men men clad in fur and leather let me remind you. And twisted like a tree struck by lightning will knelt looked around warily and snatched it up. Not with this horse will said fear had made him insolent perhaps my lord would care to. Too late the pale sword came shivering through the ringmail beneath his arm the young lord said. Of yet it was on the snow and the mud and looked down no fire gared s. A halfmoon rose will was grateful for the lordling the scars around his ear holes flushed red. If there are enemies in this wood a fire will turned away lead on he said we. From the dark of the firepit the snowcovered leanto the great sentinel was right there at the. 
