# Manage Datasets

In [2]:
import pandas as pd

## Gathering Data

In [3]:
sentiment_chatgpt = pd.read_csv('E:\MSIB_BANGKIT\CAPSTONE_C241-PR565\data\datasets_csv\chatgpt_sentiment.csv')
sentiment_mentalhealth = pd.read_csv('E:\MSIB_BANGKIT\CAPSTONE_C241-PR565\data\datasets_csv\Mental_Health_Dataset_sentiment.csv')

In [4]:
# Load some first line of datasets

print("Sentiment ChatGPT")
print(sentiment_chatgpt.head())
print("\nSentiment Mental Health")
print(sentiment_mentalhealth.head())

Sentiment ChatGPT
   Unnamed: 0                                             tweets   labels
0           0  ChatGPT: Optimizing Language Models for Dialog...  neutral
1           1  Try talking with ChatGPT, our new AI system wh...     good
2           2  ChatGPT: Optimizing Language Models for Dialog...  neutral
3           3  THRILLED to share that ChatGPT, our new model ...     good
4           4  As of 2 minutes ago, @OpenAI released their ne...      bad

Sentiment Mental Health
                                               posts predicted  intensity
0  I know as parent of child with down syndrome t...  negative         -1
1  but in my heart I know this is the future prom...   neutral          0
2  I have mylefibrosis which turn to leukemia the...  negative         -1
3  from one of my health group subject wayne dyer...   neutral          0
4  gmos now link to leukemia http nsnbc I 2013 07...   neutral          0


## Assessing Data
We only need text column and its sentiment. So let drop them first to assess data easier

In [5]:
# Sentiment ChatGPT
#calling unnamed column to drop
unnamed_col = [col for col in sentiment_chatgpt.columns if col.startswith('Unnamed:')]
sentiment_chatgpt = sentiment_chatgpt.drop(columns = unnamed_col)

# Sentiment Mental Health
sentiment_mentalhealth = sentiment_mentalhealth.drop(columns = ['intensity'])

In [6]:
# Load some first line of datasets after dropping

print("Sentiment ChatGPT")
print(sentiment_chatgpt.head())
print("\nSentiment Mental Health")
print(sentiment_mentalhealth.head())

Sentiment ChatGPT
                                              tweets   labels
0  ChatGPT: Optimizing Language Models for Dialog...  neutral
1  Try talking with ChatGPT, our new AI system wh...     good
2  ChatGPT: Optimizing Language Models for Dialog...  neutral
3  THRILLED to share that ChatGPT, our new model ...     good
4  As of 2 minutes ago, @OpenAI released their ne...      bad

Sentiment Mental Health
                                               posts predicted
0  I know as parent of child with down syndrome t...  negative
1  but in my heart I know this is the future prom...   neutral
2  I have mylefibrosis which turn to leukemia the...  negative
3  from one of my health group subject wayne dyer...   neutral
4  gmos now link to leukemia http nsnbc I 2013 07...   neutral


In [7]:
# Check any missing value
sentiment_chatgpt.info()
sentiment_mentalhealth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219294 entries, 0 to 219293
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tweets  219294 non-null  object
 1   labels  219294 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10392 entries, 0 to 10391
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   posts      10391 non-null  object
 1   predicted  10392 non-null  object
dtypes: object(2)
memory usage: 162.5+ KB


We only need "input" wich is a text and "sentiment". Then we make them in one datasets.

In [8]:
#Rename column to "text" and "sentiment"
sentiment_chatgpt = sentiment_chatgpt.rename(columns = {'tweets':'text', 'labels':'sentiment'})
sentiment_mentalhealth = sentiment_mentalhealth.rename(columns = {'posts':'text', 'predicted':'sentiment'})

In [9]:
# Load some first line of datasets after rename

print("Sentiment ChatGPT")
print(sentiment_chatgpt.head())
print("\nSentiment Mental Health")
print(sentiment_mentalhealth.head())

Sentiment ChatGPT
                                                text sentiment
0  ChatGPT: Optimizing Language Models for Dialog...   neutral
1  Try talking with ChatGPT, our new AI system wh...      good
2  ChatGPT: Optimizing Language Models for Dialog...   neutral
3  THRILLED to share that ChatGPT, our new model ...      good
4  As of 2 minutes ago, @OpenAI released their ne...       bad

Sentiment Mental Health
                                                text sentiment
0  I know as parent of child with down syndrome t...  negative
1  but in my heart I know this is the future prom...   neutral
2  I have mylefibrosis which turn to leukemia the...  negative
3  from one of my health group subject wayne dyer...   neutral
4  gmos now link to leukemia http nsnbc I 2013 07...   neutral


In [10]:
# Combine all datasets into one dataset
combined_dataset = pd.concat([sentiment_chatgpt, sentiment_mentalhealth], ignore_index = True)

print(combined_dataset)

                                                     text      sentiment
0       ChatGPT: Optimizing Language Models for Dialog...        neutral
1       Try talking with ChatGPT, our new AI system wh...           good
2       ChatGPT: Optimizing Language Models for Dialog...        neutral
3       THRILLED to share that ChatGPT, our new model ...           good
4       As of 2 minutes ago, @OpenAI released their ne...            bad
...                                                   ...            ...
229681  hey everyone I am a 25 year old male I work ou...       negative
229682  have surgery for stage 1 colon cancer 1 year a...  very negative
229683  the doctor advise we he could not remove the a...        neutral
229684  my 66 year old father have been through so muc...        neutral
229685  I have bein have a bloody stool since last yea...       negative

[229686 rows x 2 columns]


In [11]:
# Print what sentiments we have in dataset
unique_sent = combined_dataset['sentiment'].value_counts()
print(unique_sent)

sentiment
bad              107796
neutral           59862
good              56011
negative           4112
very negative      1155
positive            750
Name: count, dtype: int64


To make classify easier, we will make "positive" as "good", "very negative" and "negative" as "bad"

In [12]:
combined_dataset['sentiment'] = combined_dataset['sentiment'].replace(['negative', 'very negative'], 'bad')
combined_dataset['sentiment'] = combined_dataset['sentiment'].replace(['positive'], 'good')

In [13]:
# Print again what sentiments we have in dataset to know if it changes
unique_sent = combined_dataset['sentiment'].value_counts()
print(unique_sent)

sentiment
bad        113063
neutral     59862
good        56761
Name: count, dtype: int64


Data dengan label "bad" terlalu banyak dan dapat memengaruhi training nantinya. Oleh karena itu, dilakukan undersampling.

In [14]:
# separate data based on labels
bad_data = combined_dataset[combined_dataset['sentiment']=='bad']
neutral_data = combined_dataset[combined_dataset['sentiment']=='neutral']
good_data = combined_dataset[combined_dataset['sentiment']=='good']

# N samples for each label
n_samples = min(len(neutral_data), len(good_data))

# undersampling bad_data
bad_data = bad_data.sample(n = n_samples, random_state = 42)

# combine all data
new_dataset = pd.concat([bad_data, neutral_data, good_data])

In [15]:
# Print again what sentiments we have in dataset to know if it changes
uniques_sent = new_dataset['sentiment'].value_counts()
print(uniques_sent)

sentiment
neutral    59862
bad        56761
good       56761
Name: count, dtype: int64


In [16]:
# Save combine dataset into new csv file
combined_dataset.to_csv('E:\MSIB_BANGKIT\CAPSTONE_C241-PR565\data\capstone_new_dataset.csv', index = False)