# Data analysis



## Exploration using pandas

In [5]:
import pandas as pd


In [6]:
# Define the file path
file_path = '../english_comments_2024-05-18/all_comments_translated.csv'

# Load the dataset
df = pd.read_csv(file_path, header=None, names=['Comment', 'Category'])

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Comment,Category
0,Nice words but there are urgent questions that...,politique
1,To build a society to which everyone aspires t...,politique
2,But the retirees did not mention whether they ...,politique
3,It would be preferable if the doors were opene...,politique
4,Experts researchers and sociologists say that ...,politique


In [4]:
# Get basic information about the dataset
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 952 entries, 0 to 951
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Comment   952 non-null    object
 1   Category  952 non-null    object
dtypes: object(2)
memory usage: 15.0+ KB


In [5]:
# Get descriptive statistics for the dataset
df.describe()


Unnamed: 0,Comment,Category
count,952,952
unique,951,11
top,The ministry wants to limit the migration of d...,societe
freq,2,164


In [6]:
# Check the distribution of categories
df['Category'].value_counts()


Category
societe           164
medias            158
sport             155
tamazight         136
international      82
economie           79
faits-divers       70
politique          61
sawt-soura         21
art-et-culture     16
regions            10
Name: count, dtype: int64

In [7]:
# Check for missing values
df.isnull().sum()


Comment     0
Category    0
dtype: int64

In [9]:
# Check for duplicate rows
df.duplicated().sum()

0

## Sentiment analysis using Roberta algorithm

In [10]:
from transformers import AutoTokenizer, RobertaForSequenceClassification
from tqdm import tqdm
from scipy.special import softmax

In [11]:
# Load the RoBERTa tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load the RoBERTa model for sequence classification
model = RobertaForSequenceClassification.from_pretrained(MODEL)

# Define the sentiment analysis function using RoBERTa
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output.logits.detach().cpu().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg': scores[0][0],
        'roberta_neu': scores[0][1],
        'roberta_pos': scores[0][2]
    }
    return scores_dict




In [12]:
# Apply sentiment analysis to each comment in the DataFrame
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['Comment']  # Assuming 'Comment' is the column containing the comments
        roberta_result = polarity_scores_roberta(text)
        res[i] = roberta_result
    except RuntimeError as e:
        print(f'Broke for index {i} with error {e}')


100%|██████████| 952/952 [00:59<00:00, 16.09it/s]


In [13]:
# Create a DataFrame from the sentiment analysis results
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'Id'})


In [14]:
# Merge the sentiment analysis results with the original DataFrame
final_df = results_df.merge(df, left_on='Id', right_index=True)


In [18]:
# Display some of the sentiment analysis results along with the "Category" column to check the output
final_df[['Comment', 'Category', 'roberta_neg', 'roberta_neu', 'roberta_pos']].head(10)



Unnamed: 0,Comment,Category,roberta_neg,roberta_neu,roberta_pos
0,Nice words but there are urgent questions that...,politique,0.352582,0.559292,0.088126
1,To build a society to which everyone aspires t...,politique,0.03797,0.620614,0.341417
2,But the retirees did not mention whether they ...,politique,0.7419,0.244393,0.013707
3,It would be preferable if the doors were opene...,politique,0.140129,0.631564,0.228308
4,Experts researchers and sociologists say that ...,politique,0.794581,0.197406,0.008013
5,This is the juice of Diyal with the correctnes...,politique,0.005623,0.199223,0.795154
6,Indeed this is its condition and this is its m...,politique,0.036773,0.874638,0.088589
7,Thank you to the Moroccan Ambassador Hilal Al ...,politique,0.229273,0.409983,0.360743
8,The largest enemy of Morocco and the Moroccan ...,politique,0.786034,0.206398,0.007568
9,Since almost all states and human rights organ...,politique,0.809871,0.181175,0.008954


In [21]:
# Rename the columns
final_df.rename(columns={'roberta_neg': 'Negative', 'roberta_neu': 'Neutral', 'roberta_pos': 'Positive'}, inplace=True)


In [22]:
# Display the first 10 rows to verify the column renaming
final_df[['Comment', 'Category', 'Negative', 'Neutral', 'Positive']].head(10)


Unnamed: 0,Comment,Category,Negative,Neutral,Positive
0,Nice words but there are urgent questions that...,politique,0.352582,0.559292,0.088126
1,To build a society to which everyone aspires t...,politique,0.03797,0.620614,0.341417
2,But the retirees did not mention whether they ...,politique,0.7419,0.244393,0.013707
3,It would be preferable if the doors were opene...,politique,0.140129,0.631564,0.228308
4,Experts researchers and sociologists say that ...,politique,0.794581,0.197406,0.008013
5,This is the juice of Diyal with the correctnes...,politique,0.005623,0.199223,0.795154
6,Indeed this is its condition and this is its m...,politique,0.036773,0.874638,0.088589
7,Thank you to the Moroccan Ambassador Hilal Al ...,politique,0.229273,0.409983,0.360743
8,The largest enemy of Morocco and the Moroccan ...,politique,0.786034,0.206398,0.007568
9,Since almost all states and human rights organ...,politique,0.809871,0.181175,0.008954


In [30]:
# Create a new DataFrame to store the maximum sentiment score for each comment
max_sentiment_df = final_df[['Comment', 'Category']]


In [31]:
# Determine the maximum sentiment score for each comment
max_sentiment_df['Value'] = final_df[['Negative', 'Neutral', 'Positive']].max(axis=1)
max_sentiment_df['Sentiment'] = final_df[['Negative', 'Neutral', 'Positive']].idxmax(axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  max_sentiment_df['Value'] = final_df[['Negative', 'Neutral', 'Positive']].max(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  max_sentiment_df['Sentiment'] = final_df[['Negative', 'Neutral', 'Positive']].idxmax(axis=1)


In [32]:
# Display the first 10 rows of the new DataFrame
max_sentiment_df.head(10)





Unnamed: 0,Comment,Category,Value,Sentiment
0,Nice words but there are urgent questions that...,politique,0.559292,Neutral
1,To build a society to which everyone aspires t...,politique,0.620614,Neutral
2,But the retirees did not mention whether they ...,politique,0.7419,Negative
3,It would be preferable if the doors were opene...,politique,0.631564,Neutral
4,Experts researchers and sociologists say that ...,politique,0.794581,Negative
5,This is the juice of Diyal with the correctnes...,politique,0.795154,Positive
6,Indeed this is its condition and this is its m...,politique,0.874638,Neutral
7,Thank you to the Moroccan Ambassador Hilal Al ...,politique,0.409983,Neutral
8,The largest enemy of Morocco and the Moroccan ...,politique,0.786034,Negative
9,Since almost all states and human rights organ...,politique,0.809871,Negative


In [33]:
# Save the DataFrame to a new CSV file
max_sentiment_df.to_csv('comments.csv', index=False)
