## Importing the files and doing the bare minimum of additional prep

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the pre-split feature and label frames. The first CSV column holds the
# saved row index, so it is restored as the DataFrame index.
X_train, y_train = (
    pd.read_csv('../../data/X_train', index_col=0),
    pd.read_csv('../../data/y_train', index_col=0),
)
X_test, y_test = (
    pd.read_csv('../../data/X_test', index_col=0),
    pd.read_csv('../../data/y_test', index_col=0),
)

In [3]:
def remove_unsure(score):
    """Map the score 3 to 1 and return any other score unchanged.

    NOTE(review): presumably 3 encodes the "unsure" label (going by the
    function name) -- confirm against the labelling scheme.
    """
    return 1 if score == 3 else score

In [4]:
# Recode the test labels: every 3 becomes 1, other values are untouched.
y_test['emotion'] = y_test['emotion'].apply(remove_unsure)

In [5]:
# Recode the training labels: every 3 becomes 1, other values are untouched.
y_train['emotion'] = y_train['emotion'].apply(remove_unsure)

In [6]:
# Show the training rows that contain at least one missing value.
X_train[X_train.isna().any(axis=1)]


Unnamed: 0,cleaned_tweets
6,
7346,


In [7]:
# Drop the labels whose feature row has a missing value. The row labels are
# derived from X_train itself instead of being hard-coded, and errors='ignore'
# keeps this cell safe to re-run once those labels are already gone.
rows_missing_text = X_train.index[X_train.isna().any(axis=1)]
y_train = y_train.drop(index=rows_missing_text, errors='ignore')


In [8]:
# Remove the feature rows with missing values. Rebinding the name (rather than
# inplace=True) leaves any previously displayed frame object untouched.
X_train = X_train.dropna()

In [9]:
# Count the remaining missing values per column of the training features.
X_train.isna().sum()

cleaned_tweets    0
dtype: int64

In [10]:
# Compare the shapes of the training features and labels after the drops.
X_train.shape, y_train.shape

((6363, 1), (6363, 1))

In [11]:
# Count missing values per column of the test features.
X_test.isna().sum()

cleaned_tweets    0
dtype: int64

In [12]:
# Compare the shapes of the test features and labels.
X_test.shape, y_test.shape

((7274, 1), (7274, 1))

### Vectorize DFs


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# TF-IDF vectorizer constructed with no arguments, i.e. library defaults.
tfidf_vec = TfidfVectorizer()

In [17]:
# Fit the vectorizer on the training tweets only, then reuse the fitted
# vectorizer to transform the test tweets (no refit on test data).
X_tr_tfidf = tfidf_vec.fit_transform(X_train['cleaned_tweets'])
X_te_tfidf = tfidf_vec.transform(X_test['cleaned_tweets'])

In [21]:
# Dense, labelled copy of the training TF-IDF matrix: one column per feature
# name reported by the vectorizer, rows keyed by the X_train index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
X_tr_tfidf_df = pd.DataFrame(X_tr_tfidf.toarray(),
                             columns=tfidf_vec.get_feature_names_out(),
                             index=X_train.index)

In [22]:
# Dense, labelled copy of the test TF-IDF matrix: one column per feature name
# reported by the vectorizer, rows keyed by the X_test index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
X_te_tfidf_df = pd.DataFrame(X_te_tfidf.toarray(),
                             columns=tfidf_vec.get_feature_names_out(),
                             index=X_test.index)

## Dimensionality Reduction 

Dimensionality reduction using truncated SVD (aka LSA).

This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Contrary to PCA, this estimator does not center the data before computing the singular value decomposition. **This means it can work with sparse matrices efficiently.**

**In particular, truncated SVD works on term count/tf-idf matrices as returned by the vectorizers in sklearn.feature_extraction.text. In that context, it is known as latent semantic analysis (LSA).**

**`n_components`** (int, default=2): Desired dimensionality of output data. Must be strictly less than the number of features. **The default value is useful for visualisation. For LSA, a value of 100 is recommended.**

In [24]:
from sklearn.decomposition import TruncatedSVD

In [26]:
# Learn a 100-component truncated SVD from the training TF-IDF frame, then
# project that same frame onto the learned components.
svd = TruncatedSVD(
    n_components=100,
    n_iter=7,
    random_state=42,
).fit(X_tr_tfidf_df)
X_tr_tfidf_SVD = svd.transform(X_tr_tfidf_df)

In [28]:
# Shape of the reduced training matrix (rows, SVD components).
X_tr_tfidf_SVD.shape

(6363, 100)

In [30]:
# Project the test features with the SVD already fitted on the training data.
X_te_tfidf_SVD = svd.transform(X_te_tfidf_df)

In [31]:
from sklearn.cluster import KMeans

In [32]:
# Cluster the SVD-reduced training tweets into three groups.
# random_state pins the centroid initialisation so the labels are reproducible
# from run to run (same seed as the SVD step). n_init=10 states the number of
# restarts explicitly, because the library default changed to 'auto' in
# scikit-learn 1.4.
k_means = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X_tr_tfidf_SVD)

# Cluster id assigned to each training row by the fitted model.
predicted_clusters = k_means.labels_

In [33]:
# Peek at the cluster id assigned to each training row.
predicted_clusters

array([0, 0, 0, ..., 0, 2, 0])

In [34]:
# Cluster centres of the fitted K-means model, in the SVD-reduced space it was
# fitted on.
centers = k_means.cluster_centers_

In [None]:
# Scatter the training tweets on the first two SVD components, coloured by
# K-means cluster, with the cluster centres overlaid in red.
# X_tr_tfidf_SVD is the matrix K-means was fitted on, so its rows line up
# one-to-one with predicted_clusters.
fig, ax = plt.subplots()
ax.scatter(X_tr_tfidf_SVD[:, 0], X_tr_tfidf_SVD[:, 1],
           c=predicted_clusters, s=10)
ax.scatter(centers[:, 0], centers[:, 1], c='r', s=10)
ax.set(title='K-means clusters on the first two SVD components',
       xlabel='SVD component 1',
       ylabel='SVD component 2')
plt.show()