In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score)

In [3]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context


nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
#Importing the datasets
# Import the training dataset with only the important colums
train_df = pd.read_csv("datasets/train.csv")

# Column 3 is the column of interest in the testing set
test_df = pd.read_csv("datasets/test.csv")

In [5]:
#Explore train dataset
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [6]:
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [7]:
# Cleaning the dataset

def clean_text(text):
    if isinstance(text, str):
        # Remove links and non-alphanumeric characters, and convert to lowercase
        text = re.sub(r'http\S+|www\S+|https\S+', '', text.lower())
        text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

        # Tokenize the text into individual words
        words = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        # Join the words back into a single string
        clean_text = ' '.join(words)

        return clean_text


In [8]:
# Applying preprocessing function on the text column of the training dataset
train_df['cleaned_text'] = train_df['text'].apply(clean_text)

# Converting sentiments to numerical form
sentiment_mapping = {'neutral': 0, 'positive': 1, 'negative': -1}
train_df['sentiment'] = train_df['sentiment'].map(sentiment_mapping)
print("Number of rows before dropna:", train_df.shape[0])

# Dropping rows with missing values
train_df.dropna(inplace=True)
print("Number of rows after dropna:", train_df.shape[0])

# Print the preprocessed train dataset
print(train_df.head())

Number of rows before dropna: 27481
Number of rows after dropna: 27480
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text  sentiment  \
0  I`d have responded, if I were going          0   
1                             Sooo SAD         -1   
2                          bullying me         -1   
3                       leave me alone         -1   
4                        Sons of ****,         -1   

                       cleaned_text  
0                   responded going  
1           sooo sad miss san diego  
2                     boss bullying  
3             interview leave alone  
4  sons put

In [9]:
# Splitting the training dataset into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(train_df.cleaned_text, train_df.sentiment, test_size=0.2, random_state=24)


In [10]:
# Initialising the TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1)

vectorizer.fit(list(X_train) + list(X_test))
X_train_vectorizer =  vectorizer.transform(X_train) 
X_test_vectorizer = vectorizer.transform(X_test)




In [11]:
log_reg = LogisticRegression(random_state=0, multi_class='multinomial', max_iter=1000)
log_reg.fit(X_train_vectorizer, y_train)

In [12]:
val_pred = log_reg.predict(X_test_vectorizer)
print("The accurary of logistic regression is",accuracy_score(list(y_test), val_pred))

The accurary of logistic regression is 0.6848617176128093


In [13]:
# Finding out which TF-IDF features contribute the most to determining the sentiment label
feature_names = vectorizer.get_feature_names_out()
# Coefficients indicate the importance of each feature in the classification
feature_importance = log_reg.coef_
feature_importance_df = pd.DataFrame(feature_importance, columns=feature_names)
print(feature_importance_df)

          0        00       000        01        03        04        05  \
0 -0.029928  0.536696 -0.226815  0.062376  0.254931  0.227530  0.138556   
1  0.153810 -0.099758 -0.069210  0.080082  0.043592 -0.164294 -0.146658   
2 -0.123882 -0.436938  0.296025 -0.142458 -0.298522 -0.063236  0.008101   

         06        07        08  ...        ze   zealand      zero       zoe  \
0  0.313694 -0.118439 -0.247789  ...  0.098485  0.003092 -0.380616 -0.182633   
1 -0.034245  0.251853  0.206534  ... -0.178707 -0.140191  0.384439  0.235802   
2 -0.279449 -0.133413  0.041255  ...  0.080222  0.137099 -0.003823 -0.053169   

     zombie   zombies      zone      zoo      zulu      zzzz  
0  0.099316  0.071882  0.012219 -0.33609 -0.310911  0.010866  
1 -0.035414 -0.071920  0.272654  0.25814 -0.006256 -0.099284  
2 -0.063902  0.000038 -0.284874  0.07795  0.317167  0.088418  

[3 rows x 12560 columns]


In [14]:
num_top_features = 500

# Create a dictionary to store the top features for each sentiment label
top_features_per_sentiment = {
    -1: feature_importance_df.loc[0].nlargest(num_top_features).index.tolist(),
    0: feature_importance_df.loc[1].nlargest(num_top_features).index.tolist(),
    1: feature_importance_df.loc[2].nlargest(num_top_features).index.tolist()
}

for index, row in train_df.iterrows():
    text = row['text']
    sentiment_label = int(row['sentiment'])  # Convert sentiment label to integer
    
    if sentiment_label in top_features_per_sentiment:
        top_features = top_features_per_sentiment[sentiment_label]  # All features for the sentiment label
        sentiment_parts = [feature for feature in top_features if feature in text]
    else:
        sentiment_parts = []  # No top features available for the sentiment label
    

In [15]:
#Applying preprocessing function on the text column
test_df['cleaned_text'] = test_df['text'].apply(clean_text)

# Converting sentiments to numerical form
sentiment_mapping = {'neutral': 0, 'positive': 1, 'negative': -1}
test_df['sentiment'] = test_df['sentiment'].map(sentiment_mapping)
print("Number of rows before dropna:", train_df.shape[0])

test_df.dropna(inplace=True)
print("Number of rows after dropna:", train_df.shape[0])

# Print the preprocessed test dataset
print(test_df.head())

Number of rows before dropna: 27480
Number of rows after dropna: 27480
       textID                                               text  sentiment  \
0  f87dea47db  Last session of the day  http://twitpic.com/67ezh          0   
1  96d74cb729   Shanghai is also really exciting (precisely -...          1   
2  eee518ae67  Recession hit Veronique Branquinho, she has to...         -1   
3  01082688c6                                        happy bday!          1   
4  33987a8ee5             http://twitpic.com/4w75p - I like it!!          1   

                                        cleaned_text  
0                                   last session day  
1  shanghai also really exciting precisely skyscr...  
2  recession hit veronique branquinho quit compan...  
3                                         happy bday  
4                                               like  


In [16]:
# Transform the preprocessed text data into feature vectors
X_test = vectorizer.transform(test_df['cleaned_text']) 

# Use the trained model to predict the sentiment of the test dataset
y_pred = log_reg.predict(X_test)  # Apply the trained model to the test dataset


# Print the predicted sentiment labels for the test dataset
print(y_pred)

[ 0  1 -1 ...  0  1  0]


In [17]:
# Create an empty "selected_text" column in the test_df DataFrame
test_df['selected_text'] = ''

top_features_per_sentiment = {
    -1: feature_importance_df.loc[0].nlargest(num_top_features).index.tolist(),
    0: feature_importance_df.loc[1].nlargest(num_top_features).index.tolist(),
    1: feature_importance_df.loc[2].nlargest(num_top_features).index.tolist()
}

for index, row in test_df.iterrows():
    text = row['text']
    sentiment_label = int(row['sentiment'])  # Convert sentiment label to integer
    
    if sentiment_label in top_features_per_sentiment:
        top_features = top_features_per_sentiment[sentiment_label]  # All features for the sentiment label
        sentiment_parts = [feature for feature in top_features if feature in text]
    else:
        sentiment_parts = []  # No top features available for the sentiment label
    
    # Assign the sentiment parts to the "selected_text" column
    test_df.loc[index, 'selected_text'] = ' '.join(sentiment_parts)

# Create the submission DataFrame with the desired columns
submission_df = test_df[['textID', 'text', 'selected_text']]

In [18]:

print(submission_df)

          textID                                               text  \
0     f87dea47db  Last session of the day  http://twitpic.com/67ezh   
1     96d74cb729   Shanghai is also really exciting (precisely -...   
2     eee518ae67  Recession hit Veronique Branquinho, she has to...   
3     01082688c6                                        happy bday!   
4     33987a8ee5             http://twitpic.com/4w75p - I like it!!   
...          ...                                                ...   
3529  e5f0e6ef4b  its at 3 am, im very tired but i can`t sleep  ...   
3530  416863ce47  All alone in this old house again.  Thanks for...   
3531  6332da480c   I know what you mean. My little dog is sinkin...   
3532  df1baec676  _sutra what is your next youtube video gonna b...   
3533  469e15c5a8   http://twitpic.com/4woj2 - omgssh  ang cute n...   

           selected_text  
0                     th  
1          exciting g rs  
2           shame e c es  
3              happy day  
4           

In [19]:
submission_df.to_csv('submission.csv', index=False)