In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Replace the path with the location of your dataset on the mounted Drive.
dataset_directory = r'/content/drive/MyDrive/FINAL/training.1600000.processed.noemoticon.csv'

# os.walk() silently yields nothing when it is handed a file path, so when the
# configured path is not a directory we walk the folder that contains it.
if os.path.isdir(dataset_directory):
    walk_root = dataset_directory
else:
    walk_root = os.path.dirname(dataset_directory)

# List every file below the dataset folder to confirm the Drive mount worked.
for dirname, _, filenames in os.walk(walk_root):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
import pandas as pd
import numpy as np
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Importing the dataset
# Column names are passed to read_csv through `names=` below.
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']
# Encoding handed to read_csv when decoding the file.
DATASET_ENCODING = "ISO-8859-1"

# Replace the path with the local path to your Sentiment140 dataset
local_dataset_path = r'/content/drive/MyDrive/FINAL/training.1600000.processed.noemoticon.csv'

# Load the whole CSV into memory and preview the first five rows.
df = pd.read_csv(local_dataset_path, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
df.head()


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   ids     1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [None]:
df.tail()

Unnamed: 0,target,ids,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [None]:
df.shape

(1600000, 6)

In [None]:
df.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [None]:
df['target'].unique()

array([0, 4])

In [None]:
df['target'].nunique()

2

In [None]:
# Work on an explicit copy of the two columns we need, so that the column
# assignments in the following cells modify an independent frame rather than
# a selection of `df` (the pattern pandas flags with SettingWithCopyWarning).
dataset = df[['text', 'target']].copy()

In [None]:
dataset['target'] = dataset['target'].replace(4,1)

In [None]:
dataset['target'].unique()

array([0, 1])

In [None]:
import re
def remove_html_tags(text):
    """Return `text` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is stripped on its own and the
    content between tags is kept.
    """
    return re.sub('<.*?>', r'', text)

In [None]:
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

In [None]:
text

"<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

In [None]:
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [None]:
def remove_url(text):
    """Strip web links from `text`.

    Anything starting with ``http://``, ``https://`` or ``www.`` is removed up
    to the next whitespace character; the surrounding text, including spaces,
    is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
text1 = 'Check out my notebook https://www.kaggle.com/ubaidshah/notebook8223fc1abb'
text2 = 'Check out my notebook http://www.kaggle.com/ubaidshah/notebook8223fc1abb'
text3 = 'Google search here www.google.com'
text4 = 'For notebook click https://www.kaggle.com/ubaidshah/notebook8223fc1abb to search check www.google.com'
# My own example
text5 = 'HAI I AM MADHAV https://madhav.com'

In [None]:
remove_url(text5)

'HAI I AM MADHAV '

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_url(x))
dataset['text'].tail()

1599995    Just woke up. Having no school is the best fee...
1599996    TheWDB.com - Very cool to hear old Walt interv...
1599997    Are you ready for your MoJo Makeover? Ask me f...
1599998    Happy 38th Birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @theNSPCC @SparksCharity...
Name: text, dtype: object

In [None]:
# Lookup table of text emoticons -> a word describing them.
# handel_emoji() substitutes each key found in a tweet with "EMOJI" + value.
# Keys are matched with str.replace, i.e. as exact, case-sensitive substrings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [None]:
def handel_emoji(text, mapping=None):
    """Replace text emoticons in `text` with ``"EMOJI" + <meaning>`` tokens.

    Args:
        text: The string to process.
        mapping: Optional dict of emoticon -> meaning. When omitted, the
            module-level `emojis` table is used.

    Returns:
        `text` with every occurrence of each emoticon substituted.
    """
    if mapping is None:
        mapping = emojis

    # Substitute the longest emoticons first. In plain dict order a short
    # emoticon that is a substring of a longer one (':-)' inside 'O:-)',
    # ':-D' inside '(:-D') is consumed first, so the longer entry can never
    # match. sorted() is stable, so equal-length keys keep their table order.
    for emoji in sorted(mapping, key=len, reverse=True):
        text = text.replace(emoji, "EMOJI" + mapping[emoji])

    return text

In [None]:
handel_emoji("@stustone Your show is whack. Way worse than whack, it's wiggety-whack.  :(:(:(")

"@stustone Your show is whack. Way worse than whack, it's wiggety-whack.  EMOJIsadEMOJIsadEMOJIsad"

In [None]:
dataset['text']=dataset['text'].apply(lambda x:handel_emoji(x) )

In [None]:
url1='https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt'
slang='/kaggle/input/slang-words/slang.txt'

In [None]:
slang

'/kaggle/input/slang-words/slang.txt'

In [None]:
slang =r'/content/drive/MyDrive/FINAL/slang.txt'

In [None]:
with open(slang,'r') as f:
    lines = f.readlines()

In [None]:
lines

['AFAIK=As Far As I Know\n',
 'AFK=Away From Keyboard\n',
 'ASAP=As Soon As Possible\n',
 'ATK=At The Keyboard\n',
 'ATM=At The Moment\n',
 'A3=Anytime, Anywhere, Anyplace\n',
 'BAK=Back At Keyboard\n',
 'BBL=Be Back Later\n',
 'BBS=Be Back Soon\n',
 'BFN=Bye For Now\n',
 'B4N=Bye For Now\n',
 'BRB=Be Right Back\n',
 'BRT=Be Right There\n',
 'BTW=By The Way\n',
 'B4=Before\n',
 'B4N=Bye For Now\n',
 'CU=See You\n',
 'CUL8R=See You Later\n',
 'CYA=See You\n',
 'FAQ=Frequently Asked Questions\n',
 'FC=Fingers Crossed\n',
 "FWIW=For What It's Worth\n",
 'FYI=For Your Information\n',
 'GAL=Get A Life\n',
 'GG=Good Game\n',
 'GN=Good Night\n',
 'GMTA=Great Minds Think Alike\n',
 'GR8=Great!\n',
 'G9=Genius\n',
 'IC=I See\n',
 'ICQ=I Seek you (also a chat program)\n',
 'ILU=ILU: I Love You\n',
 'IMHO=In My Honest/Humble Opinion\n',
 'IMO=In My Opinion\n',
 'IOW=In Other Words\n',
 'IRL=In Real Life\n',
 'KISS=Keep It Simple, Stupid\n',
 'LDR=Long Distance Relationship\n',
 'LMAO=Laugh My A..

In [None]:
(lines[0].split('='))[1][:-1]

'As Far As I Know'

In [None]:
# Build the slang lookup (abbreviation -> expansion) from the "KEY=Value"
# lines read from the slang file.
chat_words = {}

for line in lines:
    # partition() splits on the FIRST '=' only, so an expansion that itself
    # contains '=' is kept whole instead of raising
    # "ValueError: too many values to unpack".
    key, separator, value = line.partition('=')
    if not separator:
        # Blank line or a line without '=': nothing to record.
        continue
    chat_words[key.strip()] = value.strip()

# chat_words now holds every KEY=Value pair found in the file
# (a key that appears more than once keeps its last value).


In [None]:
chat_words

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'ILU: I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LM

In [None]:
def chat_conversion(text):
    """Expand chat abbreviations found in `text`.

    The text is split on whitespace. Each token whose upper-cased form is a
    key of `chat_words` is swapped for its expansion; every other token is
    kept unchanged. The tokens are then re-joined with single spaces.
    """
    return " ".join(chat_words.get(token.upper(), token) for token in text.split())

In [None]:
dataset.head(10)

Unnamed: 0,text,target
0,"@switchfoot - Awww, that's a bummer. You sho...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
5,@Kwesidei not the whole crew,0
6,Need a hug,0
7,@LOLTrish hey long time no see! Yes.. Rains a...,0
8,@Tatiana_K nope they didn't have it,0
9,@twittera que me muera ?,0


In [None]:
# Preview the conversion on row 7. Select the 'text' column by name: the
# original `dataset.iloc[7][0]` used an integer key on a string-labelled
# Series, a positional fallback that pandas has deprecated.
print(chat_conversion(dataset['text'].iloc[7]))

@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit Laughing Out Loud , I'm fine thanks , how's you ?


In [None]:
dataset['text'] = dataset['text'].apply(lambda x: chat_conversion(x))
dataset['text'].tail()

1599995    Just woke up. Having no school is the best fee...
1599996    TheWDB.com - Very cool to hear old Walt interv...
1599997    Are you ready for your MoJo Makeover? Ask me f...
1599998    Happy 38th Birthday to my boo of alll time!!! ...
1599999    happy #charitytuesday @theNSPCC @SparksCharity...
Name: text, dtype: object

In [None]:
# Show row 7 of the text column. Select the column by name instead of using
# an integer key on a string-labelled Series (deprecated positional fallback).
print(dataset['text'].iloc[7])

@LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit Laughing Out Loud , I'm fine thanks , how's you ?


In [None]:
import string
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
exclude = string.punctuation
# Build the deletion table once; it is reused for every row instead of being
# rebuilt on each call.
_PUNCT_TABLE = str.maketrans('', '', exclude)


def remove_punc(text):
    """Return `text` with every character listed in `exclude` deleted.

    Characters are removed, not replaced by a space, so "don't" becomes
    "dont".
    """
    return text.translate(_PUNCT_TABLE)

In [None]:
text = 'string. With. Punctuation?'

In [None]:
remove_punc(text)

'string With Punctuation'

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: remove_punc(x))
dataset['text'].tail()

1599995    Just woke up Having no school is the best feel...
1599996    TheWDBcom  Very cool to hear old Walt intervie...
1599997    Are you ready for your MoJo Makeover Ask me fo...
1599998    Happy 38th Birthday to my boo of alll time Tup...
1599999    happy charitytuesday theNSPCC SparksCharity Sp...
Name: text, dtype: object

In [None]:
dataset['text']=dataset['text'].str.lower()
dataset['text'].tail()


1599995    just woke up having no school is the best feel...
1599996    thewdbcom  very cool to hear old walt intervie...
1599997    are you ready for your mojo makeover ask me fo...
1599998    happy 38th birthday to my boo of alll time tup...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: text, dtype: object

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import word_tokenize,sent_tokenize


In [None]:
sent_tokenize("@stustone Your show is whack. Way worse than whack, it's wiggety-whack.  EMOJIsadEMOJIsadEMOJIsad")

['@stustone Your show is whack.',
 "Way worse than whack, it's wiggety-whack.",
 'EMOJIsadEMOJIsadEMOJIsad']

In [None]:
sent=sent_tokenize("@stustone Your show is whack. Way worse than whack, it's wiggety-whack.  EMOJIsadEMOJIsadEMOJIsad")

In [None]:
wt=word_tokenize(("@stustone Your show is whack. Way worse than whack, it's wiggety-whack.  EMOJIsadEMOJIsadEMOJIsad"))

In [None]:
wt


['@',
 'stustone',
 'Your',
 'show',
 'is',
 'whack',
 '.',
 'Way',
 'worse',
 'than',
 'whack',
 ',',
 'it',
 "'s",
 'wiggety-whack',
 '.',
 'EMOJIsadEMOJIsadEMOJIsad']

In [None]:
def word_tokenize(text):
    """Return `text` unchanged (identity placeholder).

    NOTE(review): defining this name rebinds `word_tokenize`, which an earlier
    cell imported from nltk.tokenize, so any later call no longer tokenizes.
    Confirm that this is intended.
    """
    return text

In [None]:
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Load the stop-word list once into a set; the original rebuilt the full list
# for every token being tested.
english_stopwords = set(stopwords.words('english'))
sample_words = [word for word in wt if word not in english_stopwords]

In [None]:
print(" ".join(sample_words))

@ stustone Your show whack . Way worse whack , 's wiggety-whack . EMOJIsadEMOJIsadEMOJIsad


In [None]:
" ".join(wt)

"@ stustone Your show is whack . Way worse than whack , it 's wiggety-whack . EMOJIsadEMOJIsadEMOJIsad"

In [None]:
def token_split(text):
    """Split `text` on runs of whitespace and return the list of tokens."""
    return text.split()


In [None]:
def token_split(text):
    """Tokenise `text` by splitting on whitespace.

    Returns the list of whitespace-separated pieces; an empty or
    all-whitespace string gives an empty list.
    """
    return text.split()

dataset['text']=dataset['text'].apply(lambda x:token_split(x))

In [None]:
# Checkpoint the partially preprocessed frame to Drive as 'preprocessed_dataset.csv'.
# NOTE(review): the preceding cell applied token_split, so each 'text' value is a
# Python list here; the CSV will presumably hold the list's string form rather
# than plain text — confirm this is what a later reload expects.
csv_file_path = r'/content/drive/MyDrive/FINAL/preprocessed_dataset.csv'
dataset.to_csv(csv_file_path, index=False)

print(f"DataFrame saved to CSV file: {csv_file_path}")


DataFrame saved to CSV file: /content/drive/MyDrive/FINAL/preprocessed_dataset.csv


In [None]:
def token_split(word_list):
    """Normalise a tokenised tweet into a flat list of tokens.

    Args:
        word_list: Either a list of tokens or a plain string.

    Returns:
        list[str]: the whitespace-separated tokens.
    """
    # A bare string must be split directly: ' '.join() on a str puts a space
    # between every character and would explode the text into single letters
    # (this happens when the cell runs while the column still holds strings).
    if isinstance(word_list, str):
        return word_list.split()
    return ' '.join(word_list).split()

dataset['text'] = dataset['text'].apply(token_split)


In [None]:
dataset.head()

Unnamed: 0,text,target
0,"[switchfoot, awww, thats, a, bummer, you, shou...",0
1,"[is, upset, that, he, cant, update, his, faceb...",0
2,"[kenichan, i, dived, many, times, for, the, ba...",0
3,"[my, whole, body, feels, itchy, and, like, its...",0
4,"[nationwideclass, no, its, not, behaving, at, ...",0


In [None]:
from nltk.stem.porter import PorterStemmer
st=PorterStemmer()
def stemming_on_text(data):
    """Stem every token in `data` with the stemmer `st` and return the stems
    joined into a single space-separated string."""
    return " ".join(st.stem(token) for token in data)

In [None]:
dataset['text'] = dataset['text'].apply(lambda x: stemming_on_text(x))
dataset['text'].head()

0    switchfoot awww that a bummer you shoulda got ...
1    is upset that he cant updat hi facebook by tex...
2    kenichan i dive mani time for the ball manag t...
3         my whole bodi feel itchi and like it on fire
4    nationwideclass no it not behav at all im mad ...
Name: text, dtype: object

In [None]:
dataset.head()

Unnamed: 0,text,target
0,switchfoot awww that a bummer you shoulda got ...,0
1,is upset that he cant updat hi facebook by tex...,0
2,kenichan i dive mani time for the ball manag t...,0
3,my whole bodi feel itchi and like it on fire,0
4,nationwideclass no it not behav at all im mad ...,0


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
lm = WordNetLemmatizer()
def lemmatizer_on_text(text):
    """Lemmatize each whitespace-separated word of `text` as a verb
    (``pos='v'``) using `lm`, and return the words re-joined with single
    spaces."""
    lemmas = []
    for word in text.split():
        lemmas.append(lm.lemmatize(word, pos='v'))
    return " ".join(lemmas)

In [None]:
lemmatizer_on_text((dataset['text'][0]))

'switchfoot awww that a bummer you shoulda get david carr of third day to do it d'

In [None]:
dataset.head()

Unnamed: 0,text,target
0,switchfoot awww that a bummer you shoulda got ...,0
1,is upset that he cant updat hi facebook by tex...,0
2,kenichan i dive mani time for the ball manag t...,0
3,my whole bodi feel itchi and like it on fire,0
4,nationwideclass no it not behav at all im mad ...,0


In [None]:
dataset['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [None]:
# Save the stemmed dataset to the mounted Drive.
# Colab runs on Linux, where '\' is not a path separator: a path written with
# backslashes is treated as one oddly named file in the current working
# directory and never reaches Drive, so forward slashes are required here.
csv_file_path = '/content/drive/MyDrive/preprocessed data.csv'
dataset.to_csv(csv_file_path, index=False)

print(f"DataFrame saved to CSV file: {csv_file_path}")


DataFrame saved to CSV file: \content\drive\MyDrive\preprocessed data.csv


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
# 80% train / 20% test; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['target'], test_size=0.2, random_state=42)

# TF-IDF vectorization
# Vocabulary is capped at 500,000 features built from 1- to 3-word n-grams.
# NOTE(review): stop_words='english' is applied to text that was Porter-stemmed
# in an earlier cell, so stemmed forms (e.g. "thi", seen in features such as
# '545 thi') are not in the stop list and stay in the vocabulary — confirm
# whether stop words should be removed before stemming instead.
tfidf_vectorizer = TfidfVectorizer(max_features=500000,ngram_range=(1,3),stop_words='english')
# Fit on the training split only, then reuse the fitted vocabulary for the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
X_train_tfidf

<1280000x500000 sparse matrix of type '<class 'numpy.float64'>'
	with 13240571 stored elements in Compressed Sparse Row format>

In [None]:
X_train_tfidf.shape

(1280000, 500000)

In [None]:
print(X_train_tfidf)


  (0, 493211)	0.29834195093864857
  (0, 39736)	0.29347410161564935
  (0, 152206)	0.2601833697075296
  (0, 387263)	0.17695261775796636
  (0, 493210)	0.2558921586596052
  (0, 318457)	0.2097638824999451
  (0, 246864)	0.28613400086168095
  (0, 338155)	0.29834195093864857
  (0, 330721)	0.2667775900147538
  (0, 342967)	0.1257573835472715
  (0, 39632)	0.1615720544835319
  (0, 152028)	0.08532121155440585
  (0, 387139)	0.12790043021268002
  (0, 492500)	0.12060195911210453
  (0, 69702)	0.19440914482091012
  (0, 431303)	0.2894967791181781
  (0, 330576)	0.18943920483860283
  (0, 318452)	0.19215728129537726
  (0, 244938)	0.08726139343324411
  (0, 338154)	0.272629736123566
  (0, 490992)	0.13365963344990286
  (1, 412108)	0.2968987666105346
  (1, 112728)	0.4830990941274658
  (1, 131529)	0.4413299809185169
  (1, 119582)	0.4619914892132683
  :	:
  (1279998, 405196)	0.3534390256189521
  (1279998, 324499)	0.2940173875483023
  (1279998, 446068)	0.2744729376166989
  (1279998, 461334)	0.24122768667640923
  (

In [None]:
# Show the (abbreviated) array of vocabulary terms learned by the vectorizer.
# The label previously read "Feature Names n": the backslash of an intended
# newline escape was missing.
print("Feature Names:\n", tfidf_vectorizer.get_feature_names_out())

Feature Names n ['00' '000' '000 follow' ... 'ø¹ù ø¹ù' 'ùø' 'ùù']


In [None]:
# Print the first 10,001 vocabulary entries (indices 0 through 10000), each
# preceded by its feature index.
for i, feature in enumerate(tfidf_vectorizer.get_feature_names_out()[:10001]):
    print(i, feature)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5001 545 thi
5002 545 today
5003 545 work
5004 545am
5005 545pm
5006 548
5007 55
5008 55 beach
5009 55 beach pilot
5010 55 day
5011 55 degre
5012 55 hour
5013 55 hr
5014 55 mile
5015 55 min
5016 55 minut
5017 55 whale
5018 55 year
5019 550
5020 5500
5021 5500 free
5022 5500 free sh
5023 550am
5024 552
5025 5530
5026 555
5027 5555
5028 555am
5029 556
5030 559
5031 55hr
5032 55quot
5033 55secretstreet
5034 55th
5035 56
5036 56 day
5037 56 hour
5038 56 min
5039 56 minut
5040 56 week
5041 56 year
5042 560
5043 562
5044 565
5045 567
5046 568
5047 56hr
5048 56quot
5049 57
5050 57 day
5051 57 degre
5052 57 mile
5053 57 minut
5054 570
5055 573
5056 573ff1
5057 575
5058 57th
5059 58
5060 58 day
5061 58 degre
5062 58 minut
5063 580
5064 5800
5065 5800 nokia
5066 5800xm
5067 585
5068 58pm
5069 58th
5070 59
5071 59 day
5072 59 minut
5073 590
5074 5900
5075 595
5076 599
5077 59quot
5078 59th
5079 5a
5080 5am
5081 5am 6am
5082 5am amp
