# Sentiment Analysis: Weighted Scores + TextBlob Polarity

In [1]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maryamkhalid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maryamkhalid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maryamkhalid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/maryamkhalid/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Step 1: Load the Dataset

In [4]:
df = pd.read_csv("supset1_NLP.csv")
df.head()

Unnamed: 0,SURVEYR,descrip_E,QUESTION,TITLE_E,ANSWER1,Strongly Agree,ANSWER2,Somewhat Agree,ANSWER3,Neither Agree Nor Disagree,ANSWER4,Somewhat Disagree,ANSWER5,Strongly Disagree,ANSWER6,Don't Know,ANSWER7,Not Applicable
0,2022,Public Service,Q01,"Question 1. I have the tools, technology and e...",forty three percent voted for,I have all the equipment I need.,forty one percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,nine percent voted for,I lack some necessary equipment.,four percent voted for,I do not have the equipment I need.,zero percent voted for this option.,I am unsure.,zero percent voted for,This does not apply to me.
1,2022,Health Canada,Q01,"Question 1. I have the tools, technology and e...",thirty nine percent voted for,I have all the equipment I need.,forty four percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,ten percent voted for,I lack some necessary equipment.,four percent voted for,I do not have the equipment I need.,zero percent voted for this option.,I am unsure.,zero percent voted for,This does not apply to me.
2,2020,Public Service,Q01,"Question 1. I have the tools, technology and e...",thirty seven percent voted for,I have all the equipment I need.,forty five percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,ten percent voted for,I lack some necessary equipment.,three percent voted for,I do not have the equipment I need.,zero percent voted for this option.,I am unsure.,zero percent voted for,This does not apply to me.
3,2020,Health Canada,Q01,"Question 1. I have the tools, technology and e...",thirty eight percent voted for,I have all the equipment I need.,forty six percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,ten percent voted for,I lack some necessary equipment.,two percent voted for,I do not have the equipment I need.,zero percent voted for this option.,I am unsure.,zero percent voted for,This does not apply to me.
4,2022,Public Service,Q02,Question 2. The material and tools provided fo...,seventy five percent voted for,All tools are available in my preferred offici...,eighteen percent voted for,Most tools are available in my preferred offic...,three percent voted for,I am neutral about the availability of tools i...,two percent voted for,Some tools are not available in my preferred o...,one percent voted for,Tools are not available in my preferred offici...,zero percent voted for this option.,I am unsure.,one percent voted for,This does not apply to me.


## Step 2: Combine Columns into Full Text

In [7]:
columns_to_combine = ['QUESTION', 'TITLE_E', 'ANSWER1', 'Strongly Agree', 'ANSWER2', 'Somewhat Agree',
                      'ANSWER3', 'Neither Agree Nor Disagree', 'ANSWER4', 'Somewhat Disagree', 
                      'ANSWER5', 'Strongly Disagree']
df['Combined_Text'] = df[columns_to_combine].astype(str).agg(' '.join, axis=1)


## Step 3: Define NLP Preprocessing Function

In [10]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t, pos='v') for t in tokens]
    return ' '.join(tokens)


## Step 4: Apply Preprocessing and Polarity

In [13]:
df['Cleaned_Text'] = df['Combined_Text'].apply(preprocess_text)
df['Polarity_Score'] = df['Cleaned_Text'].apply(lambda x: TextBlob(x).sentiment.polarity)


## Step 5: Extract Numeric Percentages and Calculate Weighted Score

In [16]:
# Manual word-to-number mapping
word_to_num = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19, 'twenty': 20, 'twenty one': 21, 'twenty two': 22,
    'twenty three': 23, 'twenty four': 24, 'twenty five': 25,
    'twenty six': 26, 'twenty seven': 27, 'twenty eight': 28,
    'twenty nine': 29, 'thirty': 30, 'thirty one': 31, 'thirty two': 32,
    'thirty three': 33, 'thirty four': 34, 'thirty five': 35,
    'thirty six': 36, 'thirty seven': 37, 'thirty eight': 38,
    'thirty nine': 39, 'forty': 40, 'forty one': 41, 'forty two': 42,
    'forty three': 43, 'forty four': 44, 'forty five': 45,
    'forty six': 46, 'forty seven': 47, 'forty eight': 48,
    'forty nine': 49, 'fifty': 50, 'seventy five': 75
}

def extract_percent(text):
    if not isinstance(text, str):
        return 0
    return word_to_num.get(text.lower().replace("percent voted for", "").strip(), 0)

for i in range(1, 6):
    col = f'ANSWER{i}'
    df[f'percent_{col.lower()}'] = df[col].apply(extract_percent)

df['Weighted_Score'] = (
    df['percent_answer1'] * 2 +
    df['percent_answer2'] * 1 +
    df['percent_answer3'] * 0 +
    df['percent_answer4'] * -1 +
    df['percent_answer5'] * -2
)


## Step 6: Calculate Sentiment Score

In [19]:
# Normalize Weighted_Score to [-1, 1] scale
df['Weighted_Score_Norm'] = (df['Weighted_Score'] - df['Weighted_Score'].min()) / (
    df['Weighted_Score'].max() - df['Weighted_Score'].min()) * 2 - 1

# Hybrid score: 70% weighted, 30% polarity
df['Hybrid_Score'] = 0.7 * df['Weighted_Score_Norm'] + 0.3 * df['Polarity_Score']

# Final sentiment classification
def classify_sentiment(score):
    if score > 0.1:
        return 'Positive'
    elif score < -0.1:
        return 'Negative'
    else:
        return 'Neutral'

df['Hybrid_Sentiment'] = df['Hybrid_Score'].apply(classify_sentiment)
df[['Weighted_Score', 'Polarity_Score', 'Hybrid_Score', 'Hybrid_Sentiment']].head()


Unnamed: 0,Weighted_Score,Polarity_Score,Hybrid_Score,Hybrid_Sentiment
0,110,0.0,0.35,Positive
1,104,0.0,0.317692,Positive
2,103,0.0,0.312308,Positive
3,108,0.0,0.339231,Positive
4,164,0.4,0.760769,Positive


In [25]:
df.head()

Unnamed: 0,SURVEYR,descrip_E,QUESTION,TITLE_E,ANSWER1,Strongly Agree,ANSWER2,Somewhat Agree,ANSWER3,Neither Agree Nor Disagree,...,Polarity_Score,percent_answer1,percent_answer2,percent_answer3,percent_answer4,percent_answer5,Weighted_Score,Weighted_Score_Norm,Hybrid_Score,Hybrid_Sentiment
0,2022,Public Service,Q01,"Question 1. I have the tools, technology and e...",forty three percent voted for,I have all the equipment I need.,forty one percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,...,0.0,43,41,4,9,4,110,0.5,0.35,Positive
1,2022,Health Canada,Q01,"Question 1. I have the tools, technology and e...",thirty nine percent voted for,I have all the equipment I need.,forty four percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,...,0.0,39,44,4,10,4,104,0.453846,0.317692,Positive
2,2020,Public Service,Q01,"Question 1. I have the tools, technology and e...",thirty seven percent voted for,I have all the equipment I need.,forty five percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,...,0.0,37,45,4,10,3,103,0.446154,0.312308,Positive
3,2020,Health Canada,Q01,"Question 1. I have the tools, technology and e...",thirty eight percent voted for,I have all the equipment I need.,forty six percent voted for,I have most of the equipment I need.,four percent voted for,I am unsure if I have all the equipment I need.,...,0.0,38,46,4,10,2,108,0.484615,0.339231,Positive
4,2022,Public Service,Q02,Question 2. The material and tools provided fo...,seventy five percent voted for,All tools are available in my preferred offici...,eighteen percent voted for,Most tools are available in my preferred offic...,three percent voted for,I am neutral about the availability of tools i...,...,0.4,75,18,3,2,1,164,0.915385,0.760769,Positive


## Step 7: Save Final Output

In [22]:
df.to_csv("sentiment_results.csv", index=False)
print("Saved to sentiment_results.csv")


Saved to sentiment_results.csv
