In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score)

In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\23324\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\23324\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Importing the datasets
# Import the training dataset with only the important colums
train_df = pd.read_csv("datasets/train.csv")

# Column 3 is the column of interest in the testing set
test_df = pd.read_csv("datasets/test.csv")

In [4]:
#Explore train dataset
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [5]:
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [6]:
# Cleaning the dataset

def clean_text(text):
    if isinstance(text, str):
        # Remove non-alphanumeric characters and convert to lowercase
        text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())

        # Tokenize the text into individual words
        words = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        # Join the words back into a single string
        clean_text = ' '.join(words)

        return clean_text


In [7]:
# Applying preprocessing function on the text column of the training dataset
train_df['cleaned_text'] = train_df['text'].apply(clean_text)

# Converting sentiments to numerical form
sentiment_mapping = {'neutral': 0, 'positive': 1, 'negative': -1}
train_df['sentiment'] = train_df['sentiment'].map(sentiment_mapping)
print("Number of rows before dropna:", train_df.shape[0])

# Dropping rows with missing values
train_df.dropna(inplace=True)
print("Number of rows after dropna:", train_df.shape[0])

# Print the preprocessed train dataset
print(train_df.head())

Number of rows before dropna: 27481
Number of rows after dropna: 27480
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text  sentiment  \
0  I`d have responded, if I were going          0   
1                             Sooo SAD         -1   
2                          bullying me         -1   
3                       leave me alone         -1   
4                        Sons of ****,         -1   

                       cleaned_text  
0                   responded going  
1           sooo sad miss san diego  
2                     boss bullying  
3             interview leave alone  
4  sons put

In [8]:
# Splitting the training dataset into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(train_df.cleaned_text, train_df.sentiment, test_size=0.2, random_state=24)


In [9]:
# Initialising the TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1)

vectorizer.fit(list(X_train) + list(X_test))
X_train_vectorizer =  vectorizer.transform(X_train) 
X_test_vectorizer = vectorizer.transform(X_test)


In [10]:
log_reg = LogisticRegression(random_state=0, multi_class='multinomial', max_iter=1000)
log_reg.fit(X_train_vectorizer, y_train)

LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=0)

In [11]:
val_pred = log_reg.predict(X_test_vectorizer)
print("The accurary of logistic regression is",accuracy_score(list(y_test), val_pred))

The accurary of logistic regression is 0.6857714701601164


In [12]:
# Finding out which TF-IDF features contribute the most to determining the sentiment label
feature_names = vectorizer.get_feature_names_out()
# Coefficients indicate the importance of each feature in the classification
feature_importance = log_reg.coef_
feature_importance_df = pd.DataFrame(feature_importance, columns=feature_names)
print(feature_importance_df)

          0        00       000        01        03        04        05  \
0 -0.043028  0.539402 -0.216714  0.001622  0.244835  0.225411  0.087108   
1  0.170857 -0.098659 -0.052156  0.200661  0.018805 -0.165755 -0.038555   
2 -0.127829 -0.440744  0.268870 -0.202283 -0.263640 -0.059656 -0.048553   

         06        07        08  ...        ze   zealand      zero       zoe  \
0  0.313249 -0.056644 -0.243518  ...  0.140226  0.014948 -0.494884 -0.187081   
1 -0.021825  0.209610  0.205157  ... -0.219648 -0.147208  0.461392  0.244079   
2 -0.291424 -0.152965  0.038361  ...  0.079422  0.132260  0.033492 -0.056998   

     zombie   zombies      zone       zoo      zulu      zzzz  
0  0.095797  0.031117  0.013116 -0.326490 -0.311350  0.008246  
1 -0.031871 -0.048884  0.281774  0.247602 -0.005105 -0.096620  
2 -0.063926  0.017767 -0.294890  0.078888  0.316455  0.088374  

[3 rows x 12796 columns]


In [13]:
num_top_features = 500

# Create a dictionary to store the top features for each sentiment label
top_features_per_sentiment = {
    -1: feature_importance_df.loc[0].nlargest(num_top_features).index.tolist(),
    0: feature_importance_df.loc[1].nlargest(num_top_features).index.tolist(),
    1: feature_importance_df.loc[2].nlargest(num_top_features).index.tolist()
}

for index, row in train_df.iterrows():
    text = row['text']
    sentiment_label = int(row['sentiment'])  # Convert sentiment label to integer
    
    if sentiment_label in top_features_per_sentiment:
        top_features = top_features_per_sentiment[sentiment_label]  # All features for the sentiment label
        sentiment_parts = [feature for feature in top_features if feature in text]
    else:
        sentiment_parts = []  # No top features available for the sentiment label
    
    print(f"Sentiment parts for row {index}: {sentiment_parts}")

Sentiment parts for row 0: ['ed', 'goin', 'going', 'go', 'sp', 'de']
Sentiment parts for row 1: ['miss']
Sentiment parts for row 2: []
Sentiment parts for row 3: ['nt', 'ew']
Sentiment parts for row 4: ['ugh', 'already', 'c']
Sentiment parts for row 5: ['less', 'http', 'fo', 'un', 'th']
Sentiment parts for row 6: ['fun', 'smile', 'g', 'smiles', 'mi']
Sentiment parts for row 7: ['oooo']
Sentiment parts for row 8: ['th']
Sentiment parts for row 9: ['cool', 'hehe']
Sentiment parts for row 10: ['ko', 'chan', 'go', 'th']
Sentiment parts for row 11: ['g', 'like']
Sentiment parts for row 12: []
Sentiment parts for row 13: ['lost', 'nt', 'c']
Sentiment parts for row 14: ['th']
Sentiment parts for row 15: ['burned', 'sunburned', 'oh', 'sunburn', 'burn']
Sentiment parts for row 16: ['sigh', 'iv', 'na']
Sentiment parts for row 17: ['sick', 'nt', 'ew', 'yu', 'c']
Sentiment parts for row 18: ['miss', 'every', 'c', 'na']
Sentiment parts for row 19: ['th']
Sentiment parts for row 20: []
Sentiment par

Sentiment parts for row 649: ['ed', 'sp']
Sentiment parts for row 650: ['th']
Sentiment parts for row 651: []
Sentiment parts for row 652: ['nt']
Sentiment parts for row 653: ['fun', 'funny', 'haha', 'lol', 'ahaha', 'aha']
Sentiment parts for row 654: ['ouch', 'c']
Sentiment parts for row 655: ['ugh', 'worse', 'even', 'thought', 'life', 'c']
Sentiment parts for row 656: ['th']
Sentiment parts for row 657: ['g', 'lol', 'would', 'grad']
Sentiment parts for row 658: ['10 minutes', 'fo', 'th']
Sentiment parts for row 659: ['cute', 'g', 'ever', 'night']
Sentiment parts for row 660: ['happy', 'g', 'day', 'ever']
Sentiment parts for row 661: ['ah']
Sentiment parts for row 662: ['home', 'th']
Sentiment parts for row 663: ['man', 'inbox', 'c']
Sentiment parts for row 664: ['ppl', 'c']
Sentiment parts for row 665: ['sorry', 'sick', 'nt', 'c']
Sentiment parts for row 666: ['nice']
Sentiment parts for row 667: ['ed', 'renewed', 'th']
Sentiment parts for row 668: ['love', 'mothers', 'win', 'g', 'da

Sentiment parts for row 1563: ['g', 'tool']
Sentiment parts for row 1564: ['due', 'im', 'nt', 'iv', 'nerd', 'mean', 'c']
Sentiment parts for row 1565: ['th', 'de']
Sentiment parts for row 1566: ['die', 'dragged']
Sentiment parts for row 1567: ['ugh', 'enough', 'oh', 'good enough']
Sentiment parts for row 1568: ['g']
Sentiment parts for row 1569: ['drugs', 'workin', 'th', 'rug']
Sentiment parts for row 1570: ['th', 'de']
Sentiment parts for row 1571: ['nt', 'c']
Sentiment parts for row 1572: ['nt', 'jo', 'mental', 'c']
Sentiment parts for row 1573: ['life']
Sentiment parts for row 1574: ['dm', 'din', 'dinner', 'fo', 'th']
Sentiment parts for row 1575: ['sweet']
Sentiment parts for row 1576: ['sa', '45', 'hey', 'eye', 'yes', 'go', 'th']
Sentiment parts for row 1577: ['sa', 'private', 'cart', 'carter', 'youtube', 'th', 'pr', 'de']
Sentiment parts for row 1578: ['good', 'wow', 'g']
Sentiment parts for row 1579: ['gorgeous', 'g', 'day', 'rs']
Sentiment parts for row 1580: ['g', 'day', 'owe'

Sentiment parts for row 2294: []
Sentiment parts for row 2295: ['g']
Sentiment parts for row 2296: ['home', 'go', 'fo', 'un', 'th', 'pr']
Sentiment parts for row 2297: ['ed', 'till', 'fo', 'th']
Sentiment parts for row 2298: ['headache', 'ache', 'head', 'c']
Sentiment parts for row 2299: []
Sentiment parts for row 2300: []
Sentiment parts for row 2301: ['th']
Sentiment parts for row 2302: ['cry', 'crying', 'pain', 'painful', 'wat', 'c']
Sentiment parts for row 2303: ['fo', 'coffee', 'de']
Sentiment parts for row 2304: ['sucks', 'suck', 'nt', 'c']
Sentiment parts for row 2305: ['play', 'go', 'un', 'th', 'de']
Sentiment parts for row 2306: ['pr']
Sentiment parts for row 2307: ['empty', 'ed', 'cleaning', 'fo', 'th', 'de']
Sentiment parts for row 2308: ['sucks', 'suck', 'c']
Sentiment parts for row 2309: ['dead', 'rly']
Sentiment parts for row 2310: ['beautiful', 'g', 'day', 'rs']
Sentiment parts for row 2311: ['tell', 'th']
Sentiment parts for row 2312: ['upset', 'c']
Sentiment parts for 

Sentiment parts for row 3033: ['welcome', 'g', 'ever']
Sentiment parts for row 3034: ['sad']
Sentiment parts for row 3035: []
Sentiment parts for row 3036: ['wish', 'joy', 'g', 'day', 'ever', 'owe', 'however']
Sentiment parts for row 3037: ['sorry', 'im', 'oh', 'c']
Sentiment parts for row 3038: ['good', 'fun', 'funny', 'haha', 'g', 'aha']
Sentiment parts for row 3039: ['fo']
Sentiment parts for row 3040: ['hell', 'ate', 'c', 'na']
Sentiment parts for row 3041: ['th']
Sentiment parts for row 3042: ['awesome', 'watched']
Sentiment parts for row 3043: ['cant', 'nt', 'get home', 'c']
Sentiment parts for row 3044: ['love', 'welcome']
Sentiment parts for row 3045: []
Sentiment parts for row 3046: ['wake', 'breakfast', 'ah', 'th']
Sentiment parts for row 3047: ['thanks', 'thank', 'interesting', '3', 'g', 'day', 'rest', '33']
Sentiment parts for row 3048: ['ah', 'go', 'fo']
Sentiment parts for row 3049: ['tho', 'insurance', 'know', 'fo', 'un', 'th']
Sentiment parts for row 3050: ['g', 'mom', 

Sentiment parts for row 3922: ['glad', 'g']
Sentiment parts for row 3923: ['un', 'th']
Sentiment parts for row 3924: ['fo']
Sentiment parts for row 3925: ['sweet', 'sweetie', 'hugs', 'g', 'hug']
Sentiment parts for row 3926: ['g', 'u know']
Sentiment parts for row 3927: ['day', 'rs', 'luck']
Sentiment parts for row 3928: ['sick', 'c']
Sentiment parts for row 3929: ['stupid', 'headache', 'ache', 'stop', 'away', 'head', 'c']
Sentiment parts for row 3930: ['ed', 'r u', 'released', 'http', 'fo', 'un']
Sentiment parts for row 3931: []
Sentiment parts for row 3932: []
Sentiment parts for row 3933: ['sa', 'till', 'th']
Sentiment parts for row 3934: ['im', 'boo', 'even', 'half']
Sentiment parts for row 3935: []
Sentiment parts for row 3936: ['g', 'day', 'new', 'like']
Sentiment parts for row 3937: ['tired', 'sleep', 'every']
Sentiment parts for row 3938: ['annoying', 'boo', 'ew', 'iv', 'c']
Sentiment parts for row 3939: []
Sentiment parts for row 3940: []
Sentiment parts for row 3941: ['mornin

Sentiment parts for row 4653: ['nt', 'ew', 'c']
Sentiment parts for row 4654: ['great', 'g']
Sentiment parts for row 4655: ['de']
Sentiment parts for row 4656: ['din', 'go', 'fo', 'th']
Sentiment parts for row 4657: ['miss', 'im', 'c']
Sentiment parts for row 4658: []
Sentiment parts for row 4659: []
Sentiment parts for row 4660: ['sa', 'ed', 'th', 'par']
Sentiment parts for row 4661: ['ah', 'par']
Sentiment parts for row 4662: ['home', 'fo']
Sentiment parts for row 4663: ['ed', 'th']
Sentiment parts for row 4664: ['lol', 'th']
Sentiment parts for row 4665: ['happening', 'im']
Sentiment parts for row 4666: ['goin', 'ping', 'going', 'go', 'th']
Sentiment parts for row 4667: ['even', 'kicked', 'c']
Sentiment parts for row 4668: ['mid', 'de']
Sentiment parts for row 4669: ['workin', 'working weekend']
Sentiment parts for row 4670: ['good', 'g', 'day']
Sentiment parts for row 4671: ['play']
Sentiment parts for row 4672: ['new']
Sentiment parts for row 4673: []
Sentiment parts for row 4674:

Sentiment parts for row 5534: ['dd', 'din', 'word', 'th', 'de']
Sentiment parts for row 5535: ['nt', 'oh', 'c']
Sentiment parts for row 5536: ['waiting', 'go', 'th', 'de']
Sentiment parts for row 5537: ['sob', 'c']
Sentiment parts for row 5538: ['goin', 'movies', 'going', 'go', 'th']
Sentiment parts for row 5539: ['ants']
Sentiment parts for row 5540: ['ugh', 'nt', 'friday', 'c']
Sentiment parts for row 5541: []
Sentiment parts for row 5542: ['best', 'g', 'ty', 'rs', 'rock']
Sentiment parts for row 5543: ['g', 'ever']
Sentiment parts for row 5544: ['sick', 'even', 'away', 'passed away', 'c']
Sentiment parts for row 5545: ['http']
Sentiment parts for row 5546: ['nt', 'jo', 'entire', 'c']
Sentiment parts for row 5547: ['miss', 'missed', 'hot', 'c', 'maui']
Sentiment parts for row 5548: ['day', '4th', 'rs']
Sentiment parts for row 5549: ['good', 'g', 'ily', 'day', 'mi']
Sentiment parts for row 5550: ['thank']
Sentiment parts for row 5551: ['sorry', 'feel', 'mean', 'c']
Sentiment parts for

Sentiment parts for row 6366: ['miss', 'ate', 'na']
Sentiment parts for row 6367: ['tho', 'take', 'go', 'th']
Sentiment parts for row 6368: ['hope', 'better', 'g', 'day', 'rs']
Sentiment parts for row 6369: ['bummer', 'grr', 'grrr', 'grrrr']
Sentiment parts for row 6370: ['superstar', 'th']
Sentiment parts for row 6371: ['lol', 'home', 'watching', 'watchin', '6am', 'watch', 'de']
Sentiment parts for row 6372: ['ed', 'play', 'playing', 'th', 'par']
Sentiment parts for row 6373: ['ah', 'yeah', 'th', 'mid']
Sentiment parts for row 6374: ['ed', 'except', 'th']
Sentiment parts for row 6375: ['fun', 'funny', 'g', 'book', 'soo']
Sentiment parts for row 6376: ['love', 'lovely', 'g', 'ty']
Sentiment parts for row 6377: ['amazing', 'sounds', 'g']
Sentiment parts for row 6378: ['watching', 'hey', 'watchin', 'th', 'watch']
Sentiment parts for row 6379: ['dd']
Sentiment parts for row 6380: ['jealous', 'im', 'nt']
Sentiment parts for row 6381: ['hey']
Sentiment parts for row 6382: ['love', 'luv']
Se

Sentiment parts for row 7342: ['waiting', 'fo']
Sentiment parts for row 7343: ['sa', 'ed', 'th']
Sentiment parts for row 7344: ['ugh', 'dam', 'thought']
Sentiment parts for row 7345: ['annoying', 'dumb']
Sentiment parts for row 7346: ['sf']
Sentiment parts for row 7347: ['good', 'better', 'g', 'ty']
Sentiment parts for row 7348: ['tho', 'go', 'th']
Sentiment parts for row 7349: ['ed', 'wear', 'chan', 'th', 'remember', 'de']
Sentiment parts for row 7350: []
Sentiment parts for row 7351: ['die']
Sentiment parts for row 7352: ['ed', 'go']
Sentiment parts for row 7353: ['love', 'haha', 'g', 'aha']
Sentiment parts for row 7354: ['nt']
Sentiment parts for row 7355: ['happy', 'happy mother', 'mothers', 'happy mothers', 'happy mothers day', 'mom', 'day', 'rs', 'mi', 'mother']
Sentiment parts for row 7356: ['mmm', 'breakfast', 'fo', 'un']
Sentiment parts for row 7357: ['poor', 'already']
Sentiment parts for row 7358: ['ah', 'fo']
Sentiment parts for row 7359: []
Sentiment parts for row 7360: ['

Sentiment parts for row 8223: ['yup', 'th', 'de']
Sentiment parts for row 8224: ['bad', 'iv']
Sentiment parts for row 8225: ['boring', 'without', 'c']
Sentiment parts for row 8226: ['c']
Sentiment parts for row 8227: ['finger', 'nt', 'c']
Sentiment parts for row 8228: ['smile', 'g', 'smiles', 'mi']
Sentiment parts for row 8229: []
Sentiment parts for row 8230: ['g', 'night', 'rock', 'tool']
Sentiment parts for row 8231: ['th']
Sentiment parts for row 8232: ['airport', 'din', 'th']
Sentiment parts for row 8233: ['bad', 'heat', 'ate']
Sentiment parts for row 8234: ['happy', 'g', 'songs']
Sentiment parts for row 8235: ['amazing', 'true', 'g', 'tweetdeck']
Sentiment parts for row 8236: ['ed']
Sentiment parts for row 8237: ['interesting', 'g', 'would', 'rest']
Sentiment parts for row 8238: ['ah', 'go', 'th']
Sentiment parts for row 8239: []
Sentiment parts for row 8240: []
Sentiment parts for row 8241: ['good', 'g']
Sentiment parts for row 8242: ['feel', 'feeling', 'nt', 'feelin']
Sentiment

Sentiment parts for row 8959: ['g', 'mom', 'gift']
Sentiment parts for row 8960: ['ate']
Sentiment parts for row 8961: []
Sentiment parts for row 8962: ['nice', 'amaze', 'g', 'yes', 'aha']
Sentiment parts for row 8963: ['good', 'g', 'aww', 'mom', 'awww', 'mi']
Sentiment parts for row 8964: ['mmm', 'phone', 'phones', 'sp']
Sentiment parts for row 8965: ['hate', 'sucks', 'suck', 'nt', 'ate', 'c']
Sentiment parts for row 8966: ['nt']
Sentiment parts for row 8967: ['ugh', 'thought', 'every', 'c']
Sentiment parts for row 8968: ['glad', 'g', 'day', 'rest']
Sentiment parts for row 8969: ['dont like', 'nt', 'c']
Sentiment parts for row 8970: []
Sentiment parts for row 8971: ['amazing', 'thanx', 'g', 'day', 'free stuff', 'cn']
Sentiment parts for row 8972: ['sorry', 'ugh', 'im', 'seem', 'seems', '24']
Sentiment parts for row 8973: ['chan']
Sentiment parts for row 8974: ['sorry', 'c']
Sentiment parts for row 8975: []
Sentiment parts for row 8976: ['ugh', 'bummed', 'sit', 'c']
Sentiment parts for

Sentiment parts for row 9853: ['hope']
Sentiment parts for row 9854: []
Sentiment parts for row 9855: ['g']
Sentiment parts for row 9856: ['fo', 'un']
Sentiment parts for row 9857: ['miss', 'missed', 'late', 'ate', 'c']
Sentiment parts for row 9858: ['goin', 'tell', 'going', 'go', 'fo', 'th']
Sentiment parts for row 9859: ['g']
Sentiment parts for row 9860: ['double', 'th']
Sentiment parts for row 9861: ['hard', 'lose', 'c']
Sentiment parts for row 9862: ['forward', 'g', 'owe', 'mi']
Sentiment parts for row 9863: []
Sentiment parts for row 9864: ['im', 'c']
Sentiment parts for row 9865: ['good', 'thank', 'better', 'g', 'day', 'always', 'mornin', 'greetings']
Sentiment parts for row 9866: ['sa', 'ah', 'ask']
Sentiment parts for row 9867: ['eye', 'th']
Sentiment parts for row 9868: ['love', 'gives', 'g', 'advantage']
Sentiment parts for row 9869: ['read', 'go', 'th']
Sentiment parts for row 9870: ['feel', 'nt', 'iv', 'c']
Sentiment parts for row 9871: ['home', 'go', 'un']
Sentiment parts

Sentiment parts for row 10784: ['go']
Sentiment parts for row 10785: ['go', 'fo']
Sentiment parts for row 10786: ['hope']
Sentiment parts for row 10787: ['fun', 'like']
Sentiment parts for row 10788: ['firefox', 'go', 'fo', 'th']
Sentiment parts for row 10790: ['ah', 'http']
Sentiment parts for row 10791: []
Sentiment parts for row 10792: ['ed', 'de']
Sentiment parts for row 10793: ['ed']
Sentiment parts for row 10794: ['read', 'http', 'fo']
Sentiment parts for row 10795: ['better', 'g']
Sentiment parts for row 10796: []
Sentiment parts for row 10797: ['miss', 'missed']
Sentiment parts for row 10798: ['sorry']
Sentiment parts for row 10799: ['cool', 'g', 'john']
Sentiment parts for row 10800: ['love', 'lovely', 'g', 'day']
Sentiment parts for row 10801: ['sad', 'nt', 'find', 'ppl', 'c']
Sentiment parts for row 10802: []
Sentiment parts for row 10803: ['ed', 'th']
Sentiment parts for row 10804: []
Sentiment parts for row 10805: []
Sentiment parts for row 10806: []
Sentiment parts for ro

Sentiment parts for row 11625: ['go', 'th']
Sentiment parts for row 11626: ['luck', 'new']
Sentiment parts for row 11627: ['sad', 'late', 'ate', 'c']
Sentiment parts for row 11628: ['tho', 'sa', 'although', 'ah', 'yeah', 'th', 'pr']
Sentiment parts for row 11629: ['bored', 'really', 'c']
Sentiment parts for row 11630: ['nt', 'roads', 'c']
Sentiment parts for row 11631: ['awesome', 'win', 'g', 'lol']
Sentiment parts for row 11632: ['din', 'un', 'th']
Sentiment parts for row 11633: ['play', 'playing', 'th']
Sentiment parts for row 11634: ['th']
Sentiment parts for row 11635: ['sa', 'morning', 'hey', 'th', 'coffee']
Sentiment parts for row 11636: ['hope', 'hopefully', 'g', 'know right']
Sentiment parts for row 11637: ['http', 'de']
Sentiment parts for row 11638: ['miss', 'c', 'na']
Sentiment parts for row 11639: ['boo', 'hot', 'totally', 'c']
Sentiment parts for row 11640: ['fun']
Sentiment parts for row 11641: ['love', 'thanks', 'good', 'thank', 'g']
Sentiment parts for row 11642: ['ed',

Sentiment parts for row 12606: ['g', 'rs']
Sentiment parts for row 12607: ['wrong', 'ache', 'sleep', 'c']
Sentiment parts for row 12608: ['g', 'lol']
Sentiment parts for row 12609: ['th']
Sentiment parts for row 12610: ['miss', 'missed']
Sentiment parts for row 12611: ['home', 'go']
Sentiment parts for row 12612: ['ed', 'morning', 'hey', 'th']
Sentiment parts for row 12613: ['ed', 'episodes', 'http', 'lab', 'fo', 'th', 'watch', 'de']
Sentiment parts for row 12614: ['workin', 'phone', 'chan']
Sentiment parts for row 12615: ['surely', 'http', 'un', 'pr']
Sentiment parts for row 12616: ['fantastic', 'day', 'fan']
Sentiment parts for row 12617: []
Sentiment parts for row 12618: ['g', 'soo']
Sentiment parts for row 12619: ['love', 'lovely', 'yay', 'g', 'night']
Sentiment parts for row 12620: ['great', 'hope', 'g', 'day', 'ever', 'everyone']
Sentiment parts for row 12621: ['boring', 'problem']
Sentiment parts for row 12622: []
Sentiment parts for row 12623: ['g', 'mom']
Sentiment parts for r

Sentiment parts for row 13391: ['happy', '3']
Sentiment parts for row 13392: ['sick', 'c']
Sentiment parts for row 13393: ['th', 'de']
Sentiment parts for row 13394: ['play', 'playing']
Sentiment parts for row 13395: ['th']
Sentiment parts for row 13396: ['broke']
Sentiment parts for row 13397: ['nt', 'c']
Sentiment parts for row 13398: ['love', 'great', 'lovely', 'yummy', 'yum', 'g', 'lol']
Sentiment parts for row 13399: ['love', 'g', 'ever', 'night']
Sentiment parts for row 13400: ['th', 'de']
Sentiment parts for row 13401: ['nt', 'c']
Sentiment parts for row 13402: ['follow', 'dm', 'dd', 'fo', 'th', 'de']
Sentiment parts for row 13403: ['till', 'go']
Sentiment parts for row 13404: ['thanks', 'thank', 'cool']
Sentiment parts for row 13405: ['good', 'good morning', 'g', 'mornin']
Sentiment parts for row 13406: ['unfortunately', 'ate', 'c', 'na']
Sentiment parts for row 13407: ['ed', 'din', 'go', 'th', 'worked']
Sentiment parts for row 13408: ['ed', 'ping', 'tire']
Sentiment parts for 

Sentiment parts for row 14212: ['take', 'fo', 'th']
Sentiment parts for row 14213: ['enjoy', 'joy', 'g', 'rs', 'like']
Sentiment parts for row 14214: ['shattered', 'rly', 'knackered', 'c', 'na']
Sentiment parts for row 14215: ['ed', 'goin', 'going', 'go', 'fo', 'th']
Sentiment parts for row 14216: ['lol', 'din', 'th']
Sentiment parts for row 14217: ['disappointed', 'nt']
Sentiment parts for row 14218: ['workin', 'happen', 'th']
Sentiment parts for row 14219: ['ed', 'go', 'way']
Sentiment parts for row 14220: ['stupid', 'fail', 'pain', 'stomach', 'nt', 'fails', 'c']
Sentiment parts for row 14221: []
Sentiment parts for row 14222: ['g']
Sentiment parts for row 14223: ['hurts', 'hurt', 'tummy', 'late', 'ate', 'c']
Sentiment parts for row 14224: ['ed', 'phone', 'story', 'except', 'th']
Sentiment parts for row 14225: ['de']
Sentiment parts for row 14226: ['miss', 'c']
Sentiment parts for row 14227: ['ed', 'grow', 'ants', 'din', 'th', 'flo']
Sentiment parts for row 14228: []
Sentiment parts 

Sentiment parts for row 15177: ['life', 'iv']
Sentiment parts for row 15178: ['haha', 'g', 'aww', 'lol', 'awww', 'pic', 'rs', 'aha']
Sentiment parts for row 15179: ['thank', 'g', 'day', 'thank god', 'mi', 'go back']
Sentiment parts for row 15180: ['oh', 'c']
Sentiment parts for row 15181: ['good', 'nice', 'bless', 'g', 'rs']
Sentiment parts for row 15182: ['happy', '3', 'happy mother', 'mothers', 'happy mothers', 'happy mothers day', 'day', '333', 'rs', 'mother', '33']
Sentiment parts for row 15183: ['forget']
Sentiment parts for row 15184: []
Sentiment parts for row 15185: ['love', 'would']
Sentiment parts for row 15186: ['thanks', 'thank', 'worth', 'blast', 'g', 'day', 'mi']
Sentiment parts for row 15187: ['th']
Sentiment parts for row 15188: ['sa', 'last day', 'un', 'th']
Sentiment parts for row 15189: ['sa', 'ed', 'hey', 'th']
Sentiment parts for row 15190: ['ed', 'chan', 'th', 'par']
Sentiment parts for row 15191: ['sa', 'r u', 'update', 'th', 'flo']
Sentiment parts for row 15192:

Sentiment parts for row 15935: ['bad', 'ugh', 'worried', 'hospital', 'im', 'thought', 'really']
Sentiment parts for row 15936: ['tired', 'reason']
Sentiment parts for row 15937: ['good', 'g', 'ever']
Sentiment parts for row 15938: []
Sentiment parts for row 15939: ['fo', 'th']
Sentiment parts for row 15940: ['already', 'c']
Sentiment parts for row 15941: ['wrong', 'c']
Sentiment parts for row 15942: ['miss', 'missing', 'missin', 'c']
Sentiment parts for row 15943: ['watching', 'watchin', 'watch']
Sentiment parts for row 15944: ['forgot', 'nt', 'iv', 'c']
Sentiment parts for row 15945: ['love', 'nite']
Sentiment parts for row 15946: ['im', 'wasting', 'c']
Sentiment parts for row 15947: ['mmm']
Sentiment parts for row 15948: ['know', 'fo', 'th', 'way']
Sentiment parts for row 15949: ['ever', 'owe', 'however']
Sentiment parts for row 15950: ['ed', 'th', 'brown']
Sentiment parts for row 15951: ['good', 'hope', 'g', 'day']
Sentiment parts for row 15952: ['hot', 'office', 'c']
Sentiment part

Sentiment parts for row 17004: ['ed', 'din', 'par', 'de']
Sentiment parts for row 17005: []
Sentiment parts for row 17006: []
Sentiment parts for row 17007: ['miss', 'nt', 'kid', 'ppl']
Sentiment parts for row 17008: ['g']
Sentiment parts for row 17009: ['joy', 'g', 'ty', 'owe', 'rs', 'relief']
Sentiment parts for row 17010: ['tho', 'workin', 'idol', 'fo', 'th']
Sentiment parts for row 17011: ['awsome', 'mi', 'like']
Sentiment parts for row 17012: ['fair', 'longer', 'problem', 'rly', 'c', 'na']
Sentiment parts for row 17013: ['bunny']
Sentiment parts for row 17014: ['g']
Sentiment parts for row 17015: ['gutted', 'c']
Sentiment parts for row 17016: ['sa', 'th']
Sentiment parts for row 17017: ['till', 'home', 'oooo', 'un']
Sentiment parts for row 17018: ['wake', 'afternoon', 'th']
Sentiment parts for row 17019: ['3', 'day', 'mi', 'apple']
Sentiment parts for row 17020: ['lost', 'c']
Sentiment parts for row 17021: ['fav', 'g', 'rs']
Sentiment parts for row 17022: ['fo', 'de']
Sentiment pa

Sentiment parts for row 17983: ['great', 'g', 'ty', 'live']
Sentiment parts for row 17984: ['th', 'de']
Sentiment parts for row 17985: ['g', 'night', 'pic']
Sentiment parts for row 17986: ['buy', 'breakfast', 'go', 'fo', 'th']
Sentiment parts for row 17987: ['step', 'th', 'way']
Sentiment parts for row 17988: ['star', 'woo', 'g']
Sentiment parts for row 17989: ['ed', 'tour', 'go']
Sentiment parts for row 17990: ['c']
Sentiment parts for row 17991: ['wish', 'wishes', 'g']
Sentiment parts for row 17992: ['fo']
Sentiment parts for row 17993: ['fo', 'th']
Sentiment parts for row 17994: ['late', 'nobody', 'ate', 'really', 'c']
Sentiment parts for row 17995: ['fml']
Sentiment parts for row 17996: ['c']
Sentiment parts for row 17997: ['welcome']
Sentiment parts for row 17998: ['g', 'ever', 'truly']
Sentiment parts for row 17999: ['din', 'loading', 'instant']
Sentiment parts for row 18000: []
Sentiment parts for row 18001: ['good', 'g']
Sentiment parts for row 18002: ['changing', 'chan', 'th']

Sentiment parts for row 18781: ['awesome', 'g', 'looks', 'fan']
Sentiment parts for row 18782: ['gloomy', 'friday', 'c']
Sentiment parts for row 18783: ['ed', 'read', 'fo', 'th', 'de']
Sentiment parts for row 18784: ['tho', 'windows', 'sunshine', 'un', 'th']
Sentiment parts for row 18785: ['read', 'painting', 'th']
Sentiment parts for row 18786: ['cant', 'nt', 'c']
Sentiment parts for row 18787: []
Sentiment parts for row 18788: ['hope', 'g', 'book', 'correct']
Sentiment parts for row 18789: ['fo', 'th']
Sentiment parts for row 18790: ['hurt', 'nt']
Sentiment parts for row 18791: ['pretty', 'ty', 'pic']
Sentiment parts for row 18792: ['success', 'g', 'day', 'successful']
Sentiment parts for row 18793: ['nt', 'c']
Sentiment parts for row 18794: ['go']
Sentiment parts for row 18795: ['ed', 'morning', 'ah', 'yeah', 'th']
Sentiment parts for row 18796: ['workin', 'un', 'th', 'de']
Sentiment parts for row 18797: ['heart', 'lose', 'surgery', 'c']
Sentiment parts for row 18798: ['stupid', 'ug

Sentiment parts for row 19526: ['sit', 'nt', 'never']
Sentiment parts for row 19527: ['ed', 'dd', 'way', 'de']
Sentiment parts for row 19528: ['ed', 'http', 'fo', 'th']
Sentiment parts for row 19529: ['stars', 'ed', 'de']
Sentiment parts for row 19530: ['hey', 'know']
Sentiment parts for row 19531: ['ment', 'th']
Sentiment parts for row 19532: ['sucks', 'suck', 'stressed', 'oh', 'stress', 'c']
Sentiment parts for row 19533: ['good', 'happy', 'hope', 'best', 'g', 'day', 'birthday', 'rs']
Sentiment parts for row 19534: ['jo', 'man', 'c']
Sentiment parts for row 19535: ['g']
Sentiment parts for row 19536: ['nice', 'hehe']
Sentiment parts for row 19537: ['g']
Sentiment parts for row 19538: ['ed', 'ah', 'yes', 'go', 'un', 'th', 'de']
Sentiment parts for row 19539: ['thanks', 'thank', 'win', 'rs']
Sentiment parts for row 19540: ['tho', 'th']
Sentiment parts for row 19541: ['play', 'playing', 'http', 'pr']
Sentiment parts for row 19542: ['wait']
Sentiment parts for row 19543: ['im', 'c']
Sent

Sentiment parts for row 20547: ['g', 'please', 'photos']
Sentiment parts for row 20548: ['ate', 'c']
Sentiment parts for row 20549: ['hurt', 'feel', 'feeling', 'feelin']
Sentiment parts for row 20550: ['headache', 'ache', 'head', 'c']
Sentiment parts for row 20551: ['home', 'sunshine', 'un', 'th', 'way']
Sentiment parts for row 20552: ['ah']
Sentiment parts for row 20553: ['ed', 'read', 'go', 'tire', 'un', 'th']
Sentiment parts for row 20554: ['nt', 'c']
Sentiment parts for row 20555: ['drink', 'drinking', 'http', 'fo', 'th']
Sentiment parts for row 20556: []
Sentiment parts for row 20557: ['bad', 'cant', 'nt', 'really', 'left', 'thumb', 'c']
Sentiment parts for row 20558: ['th', 'de']
Sentiment parts for row 20559: []
Sentiment parts for row 20560: ['ed', 'breakfast', 'take', 'fo', 'th', 'way']
Sentiment parts for row 20561: ['nothin', 'th']
Sentiment parts for row 20562: ['ed', 'hey', 'ah', 'play', 'know', 'un', 'th']
Sentiment parts for row 20563: ['watching', 'watchin', 'go', 'watc

Sentiment parts for row 21426: ['de']
Sentiment parts for row 21427: []
Sentiment parts for row 21428: ['ruined', 'late', 'im', 'ate', 'man', 'ew', 'c']
Sentiment parts for row 21429: ['ed', 'know', 'un', 'th']
Sentiment parts for row 21430: ['th']
Sentiment parts for row 21431: ['headache', 'ache', 'head', 'house', 'c']
Sentiment parts for row 21432: ['mom']
Sentiment parts for row 21433: ['stay']
Sentiment parts for row 21434: ['love', 'hope', 'g', 'soo']
Sentiment parts for row 21435: ['g', 'day', 'inside']
Sentiment parts for row 21436: ['fo', 'th']
Sentiment parts for row 21437: ['boo', 'c']
Sentiment parts for row 21438: ['g', 'ever', '4th', 'mi']
Sentiment parts for row 21439: ['good', 'g']
Sentiment parts for row 21440: ['dd']
Sentiment parts for row 21441: ['love', 'great', 'best', '3', 'greatest', 'g', 'day', 'ever', 'bestie']
Sentiment parts for row 21442: ['ed', 'din', 'un', 'th', 'filled']
Sentiment parts for row 21443: ['sa', 'ah']
Sentiment parts for row 21444: ['sandwic

Sentiment parts for row 22375: ['sorry', 'forgot']
Sentiment parts for row 22376: ['hope']
Sentiment parts for row 22377: ['ty', 'mi', 'like']
Sentiment parts for row 22378: []
Sentiment parts for row 22379: ['chan', 'know']
Sentiment parts for row 22380: []
Sentiment parts for row 22381: ['tho', 'ed', 'go', 'th', 'de']
Sentiment parts for row 22382: ['oh', 'jo', 'turned']
Sentiment parts for row 22383: ['g', 'night', 'mi']
Sentiment parts for row 22384: ['good', 'g', 'day']
Sentiment parts for row 22385: ['ed', 'hey', 'th']
Sentiment parts for row 22386: ['lol', 'happen', 'th']
Sentiment parts for row 22387: ['sorry']
Sentiment parts for row 22388: ['boo', 'pitch', 'c']
Sentiment parts for row 22389: ['miss', 'missed', 'gutted', 'nt']
Sentiment parts for row 22390: ['ed', 'follow', 'fo', 'th']
Sentiment parts for row 22391: ['cant', 'late', 'nt', 'ate', 'c']
Sentiment parts for row 22392: ['kill', 'nt', 'kid', 'police', 'c']
Sentiment parts for row 22393: ['messing', 'c']
Sentiment pa

Sentiment parts for row 23269: ['g', 'curious', 'yes', 'treat', 'like']
Sentiment parts for row 23270: ['airport', 'ed', 'closed', 'hey', 'th']
Sentiment parts for row 23271: ['headache', 'ugh', 'ache', 'ughh', 'head', 'ughhh', 'c']
Sentiment parts for row 23272: ['losing', 'c']
Sentiment parts for row 23273: ['hello', 'goin', 'going', 'go']
Sentiment parts for row 23274: ['sa']
Sentiment parts for row 23275: ['excited', 'best', 'day']
Sentiment parts for row 23276: ['beautiful', 'wait', 'g', 'lol', 'day', 'guys', 'rs', 'aha']
Sentiment parts for row 23277: ['till', 'morning']
Sentiment parts for row 23278: ['tho', 'ed', 'less', 'go', 'fo', 'th']
Sentiment parts for row 23279: ['stupid', 'boo', 'nt', 'wont', 'facebook', 'c']
Sentiment parts for row 23280: ['g', 'grad', 'aha']
Sentiment parts for row 23281: ['till', 'stay', 'fo', 'th']
Sentiment parts for row 23282: []
Sentiment parts for row 23283: ['din', 'th']
Sentiment parts for row 23284: ['3', 'g', 'gooood']
Sentiment parts for ro

Sentiment parts for row 24201: ['workin']
Sentiment parts for row 24202: ['till', 'th']
Sentiment parts for row 24203: ['love', 'wait']
Sentiment parts for row 24204: ['gorgeous', 'two', 'g']
Sentiment parts for row 24205: ['g', 'lol', 'mornin']
Sentiment parts for row 24206: ['mi']
Sentiment parts for row 24207: ['good', 'g', 'really good', 'recommend']
Sentiment parts for row 24208: ['omg']
Sentiment parts for row 24209: ['3']
Sentiment parts for row 24210: []
Sentiment parts for row 24211: ['dear', 'de']
Sentiment parts for row 24212: ['hope', 'better', 'feel better']
Sentiment parts for row 24213: ['tho', 'ed', 'released', 'go', 'fo', 'th']
Sentiment parts for row 24214: ['sa', 'hey', 'ah', 'fo', 'th']
Sentiment parts for row 24215: ['great', 'g', 'ever', 'night']
Sentiment parts for row 24216: ['th', 'par']
Sentiment parts for row 24217: ['great', 'g', 'day']
Sentiment parts for row 24218: ['ants', 'go']
Sentiment parts for row 24219: ['g', 'ooh', 'mi']
Sentiment parts for row 242

Sentiment parts for row 25198: []
Sentiment parts for row 25199: []
Sentiment parts for row 25200: ['hide', 'chan', 'http', 'th', 'de']
Sentiment parts for row 25201: ['g']
Sentiment parts for row 25202: ['die', 'kill']
Sentiment parts for row 25203: ['love', 'loved', 'loves', 'perfect', 'g', 'mom', 'owe', 'rs', 'new']
Sentiment parts for row 25204: ['3', 'day', 'rs']
Sentiment parts for row 25205: ['http', 'th']
Sentiment parts for row 25206: []
Sentiment parts for row 25207: ['ah', 'yeah']
Sentiment parts for row 25208: ['learning', 'fo']
Sentiment parts for row 25209: ['g', 'night', 'rs']
Sentiment parts for row 25210: ['better', 'would', 'better believe', 'mi']
Sentiment parts for row 25211: ['g', 'accomplished']
Sentiment parts for row 25212: ['ed', 'de']
Sentiment parts for row 25213: ['fo', 'th', 'de']
Sentiment parts for row 25214: ['live', 'new', 'like']
Sentiment parts for row 25215: ['goin', 'going', 'go']
Sentiment parts for row 25216: ['thanks', 'thank', 'mi']
Sentiment pa

Sentiment parts for row 26070: ['ed', 'follow', 'hey', 'fo', 'th', 'de']
Sentiment parts for row 26071: []
Sentiment parts for row 26072: ['c']
Sentiment parts for row 26073: ['ed', 'happened', 'happen', 'sp', 'th']
Sentiment parts for row 26074: ['mi', 'like']
Sentiment parts for row 26075: ['sad', 'c']
Sentiment parts for row 26076: ['im', 'stop', 'c']
Sentiment parts for row 26077: ['nt']
Sentiment parts for row 26078: ['hey']
Sentiment parts for row 26079: ['un', 'th', 'watch']
Sentiment parts for row 26080: ['dead', 'officially', 'fish', 'c']
Sentiment parts for row 26081: ['bored']
Sentiment parts for row 26082: ['th']
Sentiment parts for row 26083: ['upset', '24']
Sentiment parts for row 26084: ['http']
Sentiment parts for row 26085: ['loving', 'lovin', 'g']
Sentiment parts for row 26086: ['sad']
Sentiment parts for row 26087: ['nervous', 'im', 'nt']
Sentiment parts for row 26088: ['poor', 'dog', 'rly', 'c']
Sentiment parts for row 26089: ['tho', 'indoors', 'th', 'pr']
Sentiment

Sentiment parts for row 27024: ['awesome', 'finally', 'g']
Sentiment parts for row 27025: ['happy', 'happy mother', 'mothers', 'happy mothers', 'g', 'happy mothers day', 'day', 'rs', 'mother']
Sentiment parts for row 27026: ['slow', 'losing', 'im']
Sentiment parts for row 27027: []
Sentiment parts for row 27028: ['nachos', 'th', 'de']
Sentiment parts for row 27029: ['wait', '3', 'rs']
Sentiment parts for row 27030: ['g', 'day']
Sentiment parts for row 27031: ['sick', 'im', 'nt', 'itchy', 'half', 'rly', 'c']
Sentiment parts for row 27032: ['g', 'aww']
Sentiment parts for row 27033: ['worry', 'day']
Sentiment parts for row 27034: ['training', 'raining', 'http', 'mile', 'fo', 'un', 'th']
Sentiment parts for row 27035: ['sa', 'watching', 'watchin', 'watch']
Sentiment parts for row 27036: ['play', 'http', 'fo', 'un']
Sentiment parts for row 27037: ['watching', 'watchin', 'watch']
Sentiment parts for row 27038: ['welcome', 'wait']
Sentiment parts for row 27039: ['bad', 'feel', 'feeling', 'wa

In [14]:
#Applying preprocessing function on the text column
test_df['cleaned_text'] = test_df['text'].apply(clean_text)

# Converting sentiments to numerical form
sentiment_mapping = {'neutral': 0, 'positive': 1, 'negative': -1}
test_df['sentiment'] = test_df['sentiment'].map(sentiment_mapping)
print("Number of rows before dropna:", train_df.shape[0])

test_df.dropna(inplace=True)
print("Number of rows after dropna:", train_df.shape[0])

# Print the preprocessed test dataset
print(test_df.head())

Number of rows before dropna: 27480
Number of rows after dropna: 27480
       textID                                               text  sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh          0   
1  96d74cb729   Shanghai is also really exciting (precisely -...          1   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...         -1   
3  01082688c6                                        happy bday!          1   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!          1   

                                        cleaned_text  
0            last session day http twitpic com 67ezh  
1  shanghai also really exciting precisely skyscr...  
2  recession hit veronique branquinho quit compan...  
3                                         happy bday  
4                        http twitpic com 4w75p like  


In [15]:
# Transform the preprocessed text data into feature vectors
X_test = vectorizer.transform(test_df['cleaned_text']) 

# Use the trained model to predict the sentiment of the test dataset
y_pred = log_reg.predict(X_test)  # Apply the trained model to the test dataset


# Print the predicted sentiment labels for the test dataset
print(y_pred)

[ 0  1 -1 ...  0  1  0]


In [16]:
# Create an empty "selected_text" column in the test_df DataFrame
test_df['selected_text'] = ''

top_features_per_sentiment = {
    -1: feature_importance_df.loc[0].nlargest(num_top_features).index.tolist(),
    0: feature_importance_df.loc[1].nlargest(num_top_features).index.tolist(),
    1: feature_importance_df.loc[2].nlargest(num_top_features).index.tolist()
}

for index, row in test_df.iterrows():
    text = row['text']
    sentiment_label = int(row['sentiment'])  # Convert sentiment label to integer
    
    if sentiment_label in top_features_per_sentiment:
        top_features = top_features_per_sentiment[sentiment_label]  # All features for the sentiment label
        sentiment_parts = [feature for feature in top_features if feature in text]
    else:
        sentiment_parts = []  # No top features available for the sentiment label
    
    # Assign the sentiment parts to the "selected_text" column
    test_df.loc[index, 'selected_text'] = ' '.join(sentiment_parts)

# Create the submission DataFrame with the desired columns
submission_df = test_df[['textID', 'text', 'selected_text', 'sentiment']]

In [17]:
# Map back to original labels
sentiment_mapping = {0: 'neutral', 1: 'positive', -1: 'negative'}
submission_df.loc[:, 'sentiment'] = submission_df['sentiment'].map(sentiment_mapping)
print(submission_df)

          textID                                               text  \
0     f87dea47db  Last session of the day  http://twitpic.com/67ezh   
1     96d74cb729   Shanghai is also really exciting (precisely -...   
2     eee518ae67  Recession hit Veronique Branquinho, she has to...   
3     01082688c6                                        happy bday!   
4     33987a8ee5             http://twitpic.com/4w75p - I like it!!   
...          ...                                                ...   
3529  e5f0e6ef4b  its at 3 am, im very tired but i can`t sleep  ...   
3530  416863ce47  All alone in this old house again.  Thanks for...   
3531  6332da480c   I know what you mean. My little dog is sinkin...   
3532  df1baec676  _sutra what is your next youtube video gonna b...   
3533  469e15c5a8   http://twitpic.com/4woj2 - omgssh  ang cute n...   

         selected_text sentiment  
0              http th   neutral  
1        exciting g rs  positive  
2              shame c  negative  
3      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df.loc[:, 'sentiment'] = submission_df['sentiment'].map(sentiment_mapping)


In [18]:
submission_df.to_csv('submission.csv', index=False)