# Import Module

In [74]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load Data

In [51]:
tweets_DM = pd.read_json('dm-2024-isa-5810-lab-2-homework/tweets_DM.json', lines=True)
tweets_DM

Unnamed: 0,_score,_index,_source,_crawldate,_type
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets
2,232,hashtag_tweets,"{'tweet': {'hashtags': ['bibleverse'], 'tweet_...",2017-12-25 04:39:20,tweets
3,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets
4,989,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2de2...",2016-01-08 17:18:59,tweets
...,...,...,...,...,...
1867530,827,hashtag_tweets,"{'tweet': {'hashtags': ['mixedfeeling', 'butim...",2015-05-12 12:51:52,tweets
1867531,368,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x29d0...",2017-10-02 17:54:04,tweets
1867532,498,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2a6a...",2016-10-10 11:04:32,tweets
1867533,840,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x24fa...",2016-09-02 14:25:06,tweets


In [52]:
data_identification = pd.read_csv('dm-2024-isa-5810-lab-2-homework/data_identification.csv')
data_identification

Unnamed: 0,tweet_id,identification
0,0x28cc61,test
1,0x29e452,train
2,0x2b3819,train
3,0x2db41f,test
4,0x2a2acc,train
...,...,...
1867530,0x227e25,train
1867531,0x293813,train
1867532,0x1e1a7e,train
1867533,0x2156a5,train


In [53]:
emotion = pd.read_csv('dm-2024-isa-5810-lab-2-homework/emotion.csv')
emotion

Unnamed: 0,tweet_id,emotion
0,0x3140b1,sadness
1,0x368b73,disgust
2,0x296183,anticipation
3,0x2bd6e1,joy
4,0x2ee1dd,anticipation
...,...,...
1455558,0x38dba0,joy
1455559,0x300ea2,joy
1455560,0x360b99,fear
1455561,0x22eecf,joy


In [54]:
sampleSubmission = pd.read_csv('dm-2024-isa-5810-lab-2-homework/sampleSubmission.csv')
sampleSubmission

Unnamed: 0,id,emotion
0,0x2c7743,surprise
1,0x2c1eed,surprise
2,0x2826ea,surprise
3,0x356d9a,surprise
4,0x20fd95,surprise
...,...,...
411967,0x351857,surprise
411968,0x2c028e,surprise
411969,0x1f2430,surprise
411970,0x2be24e,surprise


# Check Data

In [55]:
tweets_DM['_index'].nunique()

1

In [56]:
tweets_DM['_type'].nunique()

1

In [57]:
first_row = tweets_DM.iloc[0]
first_row['_source']

{'tweet': {'hashtags': ['Snapchat'],
  'tweet_id': '0x376b20',
  'text': 'People who post "add me on #Snapchat" must be dehydrated. Cuz man.... that\'s <LH>'}}

In [58]:
second_row = tweets_DM.iloc[1]
second_row['_source']['tweet']

{'hashtags': ['freepress', 'TrumpLegacy', 'CNN'],
 'tweet_id': '0x2d5350',
 'text': '@brianklaas As we see, Trump is dangerous to #freepress around the world. What a <LH> <LH> #TrumpLegacy.  #CNN'}

# Preprocessing

In [59]:
processed_tweets_DM = tweets_DM.copy()

def get_id(row):
    return row['_source'].get('tweet').get('tweet_id')
processed_tweets_DM['tweet_id'] = processed_tweets_DM.apply(get_id, axis=1)
print("finish get id")

def get_hashtags(row):
    return row['_source'].get('tweet').get('hashtags')
processed_tweets_DM['hashtags'] = processed_tweets_DM.apply(get_hashtags, axis=1)
print("finish get hashtags")

def get_text(row):
    return row['_source'].get('tweet').get('text')
processed_tweets_DM['text'] = processed_tweets_DM.apply(get_text, axis=1)
print("finish get text")

processed_tweets_DM.drop(['_index', '_source', '_type'], axis=1, inplace=True)
processed_tweets_DM

finish get id
finish get hashtags
finish get text


Unnamed: 0,_score,_crawldate,tweet_id,hashtags,text
0,391,2015-05-23 11:42:47,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ..."
1,433,2016-01-28 04:52:09,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #..."
2,232,2017-12-25 04:39:20,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k..."
3,376,2016-01-24 23:53:05,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>
4,989,2016-01-08 17:18:59,0x2de201,[],"""Trust is not the same as faith. A friend is s..."
...,...,...,...,...,...
1867530,827,2015-05-12 12:51:52,0x316b80,"[mixedfeeling, butimTHATperson]",When you buy the last 2 tickets remaining for ...
1867531,368,2017-10-02 17:54:04,0x29d0cb,[],I swear all this hard work gone pay off one da...
1867532,498,2016-10-10 11:04:32,0x2a6a4f,[],@Parcel2Go no card left when I wasn't in so I ...
1867533,840,2016-09-02 14:25:06,0x24faed,[],"Ah, corporate life, where you can date <LH> us..."


In [60]:
processed_tweets_DM = pd.merge(processed_tweets_DM, data_identification, on='tweet_id')
processed_tweets_DM['identification'].isna().sum()

0

In [61]:
train_df = processed_tweets_DM[processed_tweets_DM['identification']=="train"].drop(['identification'], axis=1)
test_df = processed_tweets_DM[processed_tweets_DM['identification']=="test"].drop(['identification'], axis=1)
train_df = pd.merge(train_df, emotion, on='tweet_id')

unique_categories = train_df['emotion'].unique()
category_to_number = {category: number for number, category in enumerate(unique_categories, start=1)}
train_df['numerical_emotion'] = train_df['emotion'].map(category_to_number)

In [62]:
train_df

Unnamed: 0,_score,_crawldate,tweet_id,hashtags,text,emotion,numerical_emotion
0,391,2015-05-23 11:42:47,0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",anticipation,1
1,433,2016-01-28 04:52:09,0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",sadness,2
2,376,2016-01-24 23:53:05,0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,fear,3
3,120,2015-06-11 04:44:05,0x1d755c,"[authentic, LaughOutLoud]",@RISKshow @TheKevinAllison Thx for the BEST TI...,joy,4
4,1021,2015-08-18 02:30:07,0x2c91a8,[],Still waiting on those supplies Liscus. <LH>,anticipation,1
...,...,...,...,...,...,...,...
1455558,94,2016-12-26 02:44:07,0x321566,"[NoWonder, Happy]",I'm SO HAPPY!!! #NoWonder the name of this sho...,joy,4
1455559,627,2015-04-01 08:14:56,0x38959e,[],In every circumtance I'd like to be thankful t...,joy,4
1455560,274,2016-11-17 23:46:22,0x2cbca6,[blessyou],there's currently two girls walking around the...,joy,4
1455561,840,2016-09-02 14:25:06,0x24faed,[],"Ah, corporate life, where you can date <LH> us...",joy,4


In [63]:
test_df

Unnamed: 0,_score,_crawldate,tweet_id,hashtags,text
2,232,2017-12-25 04:39:20,0x28b412,[bibleverse],"Confident of your obedience, I write to you, k..."
4,989,2016-01-08 17:18:59,0x2de201,[],"""Trust is not the same as faith. A friend is s..."
9,66,2015-09-09 09:22:55,0x218443,"[materialism, money, possessions]",When do you have enough ? When are you satisfi...
30,104,2015-10-10 14:33:26,0x2939d5,"[GodsPlan, GodsWork]","God woke you up, now chase the day #GodsPlan #..."
33,310,2016-10-23 08:49:50,0x26289a,[],"In these tough times, who do YOU turn to as yo..."
...,...,...,...,...,...
1867525,602,2016-12-10 18:01:00,0x2913b4,[],"""For this is the message that ye heard from th..."
1867529,598,2015-01-04 14:40:55,0x2a980e,[],"""There is a lad here, which hath five barley l..."
1867530,827,2015-05-12 12:51:52,0x316b80,"[mixedfeeling, butimTHATperson]",When you buy the last 2 tickets remaining for ...
1867531,368,2017-10-02 17:54:04,0x29d0cb,[],I swear all this hard work gone pay off one da...


In [64]:
train_df.emotion.unique()

array(['anticipation', 'sadness', 'fear', 'joy', 'anger', 'trust',
       'disgust', 'surprise'], dtype=object)

# Training and Predicting

### 1. tfidf matrix & MultinomialNB

In [77]:
tfidf_vectorizer = TfidfVectorizer(max_features=20000)  # Limit number of terms if needed
#tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_df['text'])
tfidf_matrix_train = tfidf_vectorizer.transform(train_df['text'])
terms = tfidf_vectorizer.get_feature_names_out()

category_mapping = dict(train_df[['numerical_emotion', 'emotion']].drop_duplicates().values)
target_names = [category_mapping[label] for label in sorted(category_mapping.keys())]
print(target_names)

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix_train, train_df['numerical_emotion'], test_size=0.3, random_state=42)

['anticipation', 'sadness', 'fear', 'joy', 'anger', 'trust', 'disgust', 'surprise']


In [78]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

y_test_pred = nb_classifier.predict(X_test)
y_train_pred = nb_classifier.predict(X_train)

print("test: ")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred, target_names=target_names, digits=4))

print("train: ")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:\n", classification_report(y_train, y_train_pred, target_names=target_names, digits=4))

test: 
Accuracy: 0.5195124911546274

Classification Report:
               precision    recall  f1-score   support

anticipation     0.6474    0.4537    0.5335     74942
     sadness     0.5180    0.4063    0.4554     58054
        fear     0.8369    0.2586    0.3951     19225
         joy     0.4794    0.8849    0.6219    154623
       anger     0.8273    0.1153    0.2023     11930
       trust     0.6412    0.1814    0.2828     61740
     disgust     0.5043    0.3179    0.3899     41475
    surprise     0.8472    0.1167    0.2051     14680

    accuracy                         0.5195    436669
   macro avg     0.6627    0.3418    0.3858    436669
weighted avg     0.5762    0.5195    0.4792    436669

train: 
Accuracy: 0.5327727908889442

Classification Report:
               precision    recall  f1-score   support

anticipation     0.6591    0.4655    0.5457    173993
     sadness     0.5428    0.4272    0.4781    135383
        fear     0.8487    0.2675    0.4068     44774
         

In [79]:
tfidf_matrix_test = tfidf_vectorizer.transform(test_df['text'])
y_pred = nb_classifier.predict(tfidf_matrix_test)

In [80]:
list_of_tuples = list(zip(y_pred, test_df['tweet_id'].values))
result = pd.DataFrame(list_of_tuples, columns=['emotion', 'id'])
result['emotion'] = result['emotion'].apply(lambda r: category_mapping.get(r))

In [81]:
result

Unnamed: 0,emotion,id
0,anticipation,0x28b412
1,anticipation,0x2de201
2,joy,0x218443
3,joy,0x2939d5
4,joy,0x26289a
...,...,...
411967,anticipation,0x2913b4
411968,anticipation,0x2a980e
411969,joy,0x316b80
411970,joy,0x29d0cb


In [82]:
result.to_csv('result.csv', index=False)

### 2. tfidf matrix & Decision Trees

In [83]:
tfidf_vectorizer = TfidfVectorizer(max_features=20000)  # Limit number of terms if needed
#tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_df['text'])
tfidf_matrix_train = tfidf_vectorizer.transform(train_df['text'])
terms = tfidf_vectorizer.get_feature_names_out()

category_mapping = dict(train_df[['numerical_emotion', 'emotion']].drop_duplicates().values)
target_names = [category_mapping[label] for label in sorted(category_mapping.keys())]
print(target_names)

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix_train, train_df['numerical_emotion'], test_size=0.3, random_state=42)

['anticipation', 'sadness', 'fear', 'joy', 'anger', 'trust', 'disgust', 'surprise']


In [86]:
DT_model = DecisionTreeClassifier(random_state=1, min_samples_split=50, min_samples_leaf=25)
DT_model = DT_model.fit(X_train, y_train)

y_train_pred = DT_model.predict(X_train)
y_test_pred = DT_model.predict(X_test)

print("test: ")
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report:\n", classification_report(y_test, y_test_pred, target_names=target_names, digits=4))

print("train: ")
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:\n", classification_report(y_train, y_train_pred, target_names=target_names, digits=4))

test: 
Accuracy: 0.4739654062917221

Classification Report:
               precision    recall  f1-score   support

anticipation     0.4831    0.4623    0.4725     74942
     sadness     0.3995    0.3877    0.3935     58054
        fear     0.6812    0.2633    0.3798     19225
         joy     0.5016    0.7310    0.5950    154623
       anger     0.6687    0.1560    0.2530     11930
       trust     0.4297    0.2677    0.3299     61740
     disgust     0.3446    0.2606    0.2968     41475
    surprise     0.7763    0.1723    0.2821     14680

    accuracy                         0.4740    436669
   macro avg     0.5356    0.3376    0.3753    436669
weighted avg     0.4815    0.4740    0.4520    436669

train: 
Accuracy: 0.5502927684332227

Classification Report:
               precision    recall  f1-score   support

anticipation     0.5665    0.5473    0.5568    173993
     sadness     0.4940    0.4823    0.4881    135383
        fear     0.7147    0.2884    0.4110     44774
         

In [87]:
tfidf_matrix_test = tfidf_vectorizer.transform(test_df['text'])
y_pred = DT_model.predict(tfidf_matrix_test)

In [88]:
list_of_tuples = list(zip(y_pred, test_df['tweet_id'].values))
result = pd.DataFrame(list_of_tuples, columns=['emotion', 'id'])
result['emotion'] = result['emotion'].apply(lambda r: category_mapping.get(r))

In [89]:
result

Unnamed: 0,emotion,id
0,trust,0x28b412
1,anticipation,0x2de201
2,joy,0x218443
3,joy,0x2939d5
4,joy,0x26289a
...,...,...
411967,joy,0x2913b4
411968,disgust,0x2a980e
411969,anticipation,0x316b80
411970,joy,0x29d0cb


In [90]:
result.to_csv('result.csv', index=False)

### 3. PAMI

In [None]:
#Create separate DataFrames for each category
categories = X['category_name'].unique()  # Get unique category labels 四種不同的 category
category_dfs = {}  # Dictionary to store DataFrames for each category

for category in categories:
    # Filter the original DataFrame by category
    category_dfs[category] = X[X['category_name'] == category].copy()

# Function to create term-document frequency DataFrame for each category
def create_term_document_df(df):
    count_vect = CountVectorizer()  # Initialize the CountVectorizer
    X_counts = count_vect.fit_transform(df['text'])  # Transform the text data into word counts
    
    # Get the unique words (vocabulary) from the vectorizer
    words = count_vect.get_feature_names_out()
    
    # Create a DataFrame where rows are documents and columns are words
    term_document_df = pd.DataFrame(X_counts.toarray(), columns=words)
    
    return term_document_df

# Create term-document frequency DataFrames for each category
term_document_dfs = {}  # Dictionary to store term-document DataFrames for each category

for category in categories:
    term_document_dfs[category] = create_term_document_df(category_dfs[category])

In [None]:
from PAMI.extras.DF2DB import DenseFormatDF as db

# Loop through the dictionary of term-document DataFrames
for category in term_document_dfs:
    # Replace dots with underscores in the category name to avoid errors in the file creation
    category_safe = category.replace('.', '_')

    # Create the DenseFormatDF object and convert to a transactional database
    obj = db.DenseFormatDF(term_document_dfs[category])
    obj.convert2TransactionalDatabase(f'td_freq_db_{category_safe}.csv', '>=', 1)
    #使用頻率大於等於1的(有使用的)都會被加入 transaction 中

### 4. Dimension reduction

### 5. 