# Sentiment Classifier using Tfidf

In [14]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2023-03-24 13:50:32--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-03-24 13:50:34 (54.4 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [15]:
import shutil
shutil.unpack_archive("aclImdb_v1.tar.gz", "/content/")

In [17]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
import os
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

**DATAFRAMES FROM DATASET**

In [49]:
path = "/content/aclImdb/train/pos/"
temp = []
for file in os.listdir(path):
    with open(os.path.join(path + file), "r") as f:
        temp.append(f.readlines()[0])
pos = pd.DataFrame({"reviews": temp, 
                    "labels": list(np.ones(len(temp), dtype=int))})



In [39]:
pos.head()

Unnamed: 0,reviews,labels
0,If I had just seen the pilot of this show I wo...,1
1,I loved it! Fred MacMurray is wonderful as Ski...,1
2,"This series it's ""something different"". Someti...",1
3,"There's some very clever humour in this film, ...",1
4,When the Grinch came out I was excited though ...,1


In [54]:
train_test_split

<function sklearn.model_selection._split.train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)>

**TOKENIZATION**

In [25]:
def tokenize_data(dataset):
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    for i in range(dataset.shape[0]):
       dataset["reviews"][i] = tokenizer.tokenize(dataset["reviews"][i])
    return dataset

**STOP WORDS REMOVAL**

In [26]:
def remove_stop_words(dataset):
    stop_words = set(stopwords.words('english'))
    for i in range(dataset.shape[0]):
        dataset["reviews"][i] = ([token.lower() for token in dataset["reviews"][i] if token not in stop_words])
    
    return dataset

**NORMALIZATION**

In [27]:
def normalize(dataset):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    for i in range(dataset.shape[0]):
        dataset.reviews[i] = " ".join([lemmatizer.lemmatize(token) for token in dataset.reviews[i]]).strip()
    return dataset

**PUNCTUATION AND SYMBOLS REMOVAL**

In [33]:
def remove_garbage(dataset):
    garbage = "~`!@#$%^&*()_-+={[}]|\:;'<,>.?/"
    for i in range(dataset.shape[0]):
        dataset.reviews[i]="".join([char for char in dataset.reviews[i] if char not in garbage])
    return dataset

**FEATURE EXTRACTION**

In [36]:
def fit_corpus(train_data, test_data):
    corpus = pd.DataFrame({"reviews": train_data["reviews"]})
    corpus.reviews.append(test_data["reviews"], ignore_index=True)
    tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1,2))
    tfidf.fit(corpus["reviews"])
    return tfidf

In [37]:
def transform_data(tfidf, dataset):
    features = tfidf.transform(dataset["reviews"])
    return pd.DataFrame(features.todense(), columns = tfidf.get_feature_names())

In [57]:
tfidf = fit_corpus(train_data, test_data)  
train_features = transform_data(tfidf, train_data)  #transforming 
test_features = transform_data(tfidf, test_data)    #Train and Test
train_labels = train_data["labels"]  #Taking lables in separate
test_labels = test_data["labels"]    #variables

NameError: ignored

In [58]:
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(train_features, train_labels)
clf.predict(test_features.iloc[[5]])

In [62]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load data
data = pd.read_csv('sentiment_data.csv')

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2)

# Create TF-IDF vectorizer
tfidf = TfidfVectorizer()

# Fit vectorizer to training data and transform it
X_train_tfidf = tfidf.fit_transform(X_train)

# Train logistic regression classifier
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train)

# Transform test data using the same vectorizer
X_test_tfidf = tfidf.transform(X_test)

# Evaluate model on test data
accuracy = clf.score(X_test_tfidf, y_test)
print("Accuracy:", accuracy)


KeyError: ignored

In [17]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [18]:
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
movies = soup.select('td.titleColumn')
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value')
        for b in soup.select('td.posterColumn span[name=ir]')]
 

In [19]:
list = []
for index in range(0, len(movies)):
    movie_string = movies[index].get_text()
    movie = (' '.join(movie_string.split()).replace('.', ''))
    movie_title = movie[len(str(index))+1:-7]
    year = re.search('\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index))-(len(movie))]
    data = {"place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
            }
    list.append(data)

In [20]:
for movie in list:
    print(movie['place'], '-', movie['movie_title'], '('+movie['year'] +
          ') -', 'Starring:', movie['star_cast'], movie['rating'])

1 - The Shawshank Redemption (1994) - Starring: Frank Darabont (dir.), Tim Robbins, Morgan Freeman 9.23574881449499
2 - The Godfather (1972) - Starring: Francis Ford Coppola (dir.), Marlon Brando, Al Pacino 9.155844693301388
3 - The Dark Knight (2008) - Starring: Christopher Nolan (dir.), Christian Bale, Heath Ledger 8.99171444223974
4 - The Godfather Part II (1974) - Starring: Francis Ford Coppola (dir.), Al Pacino, Robert De Niro 8.983856947540536
5 - 12 Angry Men (1957) - Starring: Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb 8.953134807145043
6 - Schindler's List (1993) - Starring: Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes 8.940125663414834
7 - The Lord of the Rings: The Return of the King (2003) - Starring: Peter Jackson (dir.), Elijah Wood, Viggo Mortensen 8.932478463671059
8 - Pulp Fiction (1994) - Starring: Quentin Tarantino (dir.), John Travolta, Uma Thurman 8.846235149945647
9 - The Lord of the Rings: The Fellowship of the Ring (2001) - Starring: Peter Jackson (dir

In [21]:
df = pd.DataFrame(list)
df.to_csv('imdb_top_250_movies.csv',index=False)

**Importing a CSV file using the read_csv function**

In [22]:
import pandas as pd


In [28]:
data = pd.read_csv("imdb_top_250_movies.csv")

In [29]:
data.head()

Unnamed: 0,place,movie_title,rating,year,star_cast
0,1,The Shawshank Redemption,9.235749,1994,"Frank Darabont (dir.), Tim Robbins, Morgan Fre..."
1,2,The Godfather,9.155845,1972,"Francis Ford Coppola (dir.), Marlon Brando, Al..."
2,3,The Dark Knight,8.991714,2008,"Christopher Nolan (dir.), Christian Bale, Heat..."
3,4,The Godfather Part II,8.983857,1974,"Francis Ford Coppola (dir.), Al Pacino, Robert..."
4,5,12 Angry Men,8.953135,1957,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb"


**Remove HTML Tags and emojis if required**

In [34]:
from html.parser import HTMLParser

In [38]:
import html
html.unescape

<function html.unescape(s)>

In [42]:
tweet="I enjoyd the event which took place yesteday & I lovdddd itttt! It's awesome you'll luv it #HadFun #Enjoyed" 


In [40]:
tweet = re.sub(r'#', '', tweet)

In [43]:
print("After removing Hashtags the tweet is:-\n{}".format(tweet))

After removing Hashtags the tweet is:-
I enjoyd the event which took place yesteday & I lovdddd itttt! It's awesome you'll luv it #HadFun #Enjoyed


**Stemming Words**

In [46]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer() 
words = ['run','ran','running']
for x in words:
    print(x, " : ", ps.stem(x))

run  :  run
ran  :  ran
running  :  run


In [47]:
ps.stem('runs')

'run'

**Stemming a Sentence**

In [48]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [49]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
sentence = "The root word is also called the stem and hence the name stemming" 
words = word_tokenize(sentence) 
for x in words:
    print(x, " : ", ps.stem(x))

The  :  the
root  :  root
word  :  word
is  :  is
also  :  also
called  :  call
the  :  the
stem  :  stem
and  :  and
hence  :  henc
the  :  the
name  :  name
stemming  :  stem


In [51]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def stemmer(text):
	return [porter.stem(word) for word in text.split()]
PorterStemmer().stem('complications')

'complic'

**Vectorizing with Tfidf**

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The cycle is ridden on the track.",
	"The bus is driven on the road.",
	"He is driving the bus."]

In [9]:
vectorizer = TfidfVectorizer()
vectorizer.fit(text)
print(vectorizer.vocabulary_)
print(vectorizer.idf_)

{'the': 9, 'cycle': 1, 'is': 5, 'ridden': 7, 'on': 6, 'track': 10, 'bus': 0, 'driven': 2, 'road': 8, 'he': 4, 'driving': 3}
[1.28768207 1.69314718 1.69314718 1.69314718 1.69314718 1.
 1.28768207 1.69314718 1.69314718 1.         1.69314718]


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'Here is the first letter.',
    'This document is the second letter.',
    'And this is the third one.',
    'Is this any other letter?']

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())
print(x.shape)

['and' 'any' 'document' 'first' 'here' 'is' 'letter' 'one' 'other'
 'second' 'the' 'third' 'this']
(4, 13)
