In [1]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nlpaug.augmenter.word as naw
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries imported above, so version-related breakage will not be announced.
warnings.filterwarnings("ignore")




In [2]:
# Load the training headlines and their labels.
# Forward slashes are used so the relative path resolves on Windows as well as
# on POSIX systems (a backslash is only a path separator on Windows).
# NOTE(review): the export cells at the end of this notebook write back to these
# same two files, so running the notebook again starts from already-processed data.
X_train = pd.read_csv('Spliting Data/X_train.csv')
y_train = pd.read_csv('Spliting Data/y_train.csv')

In [3]:
# Keep only the 'Data' column of each loaded frame; from here on both names
# refer to a Series rather than a DataFrame.
X_train = X_train.loc[:, 'Data']
y_train = y_train.loc[:, 'Data']

In [4]:
# Pair every headline with its label in a single two-column frame.
train_dataframe = pd.DataFrame(dict(headline=X_train, category=y_train))

In [5]:
# Preview the first rows of the combined headline/category frame.
train_dataframe.head()

Unnamed: 0,headline,category
0,Germany Detains Suspect In Attack On Top Socce...,OTHER7
1,A Letter to My Walking Daughter,OTHER6
2,Stop Letting Other People's Negativity Affect ...,WELLNESS
3,Massive document leak reveals offshore wealth ...,OTHER7
4,This Woman Put A Game-Changing Twist On Your A...,OTHER1


In [6]:
def take_most_important_samples(df, percent , n_clusters ):
    """Keep the leading ``percent`` share of rows from every cluster.

    For each cluster id in ``range(n_clusters)`` the rows whose
    ``cluster_label`` equals that id are taken in the order they appear in
    ``df`` and the first ``round(percent * cluster_size)`` of them are kept.
    The caller is therefore expected to have ordered ``df`` so that the rows
    it wants to keep come first inside each cluster.

    Args:
        df: Frame containing at least a ``cluster_label`` column.
        percent: Fraction of each cluster to keep.
        n_clusters: Number of cluster ids to scan, starting at 0.

    Returns:
        A new frame with the kept rows, clusters in ascending id order and a
        fresh 0..n-1 index. When no row is kept the result is empty but still
        carries the columns of ``df``, so column lookups on it keep working.
    """
    kept_parts = []

    for i in range(n_clusters):
        members = df[df['cluster_label'] == i]
        sz = round(percent * members.shape[0])
        if sz:
            kept_parts.append(members.head(sz))

    if not kept_parts:
        # Nothing selected: return a zero-row frame with the input's columns
        # instead of a column-less DataFrame.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once instead of re-copying the accumulated frame per cluster.
    return pd.concat(kept_parts, ignore_index=True)

def reduce(category , n_clusters , percent ):
    """Shrink one category of the global ``train_dataframe`` to its most central headlines.

    The category's headlines are converted to bag-of-words counts, clustered
    with K-Means, and scored with ``1 / (1 + distance to the assigned
    centroid)``. Inside every cluster the best-scoring ``percent`` share is
    kept via ``take_most_important_samples`` and the rest is discarded. The
    global ``train_dataframe`` is then rebound to a new frame made of all other
    categories followed by the retained rows, with a fresh 0..n-1 index.

    Args:
        category: Value of the ``category`` column to reduce.
        n_clusters: Number of K-Means clusters to fit.
        percent: Fraction of each cluster to keep.

    Returns:
        None. Progress is printed and the result is stored in the global
        ``train_dataframe``. Calling this twice for the same category reduces
        it twice.
    """
    global train_dataframe
    headlines = train_dataframe.loc[train_dataframe['category'] == category, 'headline']

    print ("category shape : " ,headlines.shape)
    if headlines.empty:
        # Nothing to vectorize or cluster; leave the global frame untouched.
        print ("No rows found for category : " , category)
        return

    vectorizer = CountVectorizer()
    sparse_matrix = vectorizer.fit_transform(headlines)
    print ("sparse_matrix shape : ",sparse_matrix.shape)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(sparse_matrix)
    print ( "cluster_labels shape : " , cluster_labels.shape)

    # transform() returns the distance from every sample to every centroid as
    # an (n_samples, n_clusters) array; pick each sample's own centroid. This
    # avoids materialising dense (n_samples, vocabulary) arrays.
    distances_to_all = kmeans.transform(sparse_matrix)
    row_positions = np.arange(distances_to_all.shape[0])
    distances_to_centroids = distances_to_all[row_positions, cluster_labels]
    importance_values = 1 / (1 + distances_to_centroids)
    print ("importance_values shape : " , importance_values.shape)

    scored = pd.DataFrame ({'headline': headlines , 'cluster_label' :cluster_labels , "importance" : importance_values })
    print("category shape : ", scored.shape)

    # Order by cluster id, most important first inside each cluster, so that
    # take_most_important_samples' head() keeps the highest-scoring rows.
    scored = scored.sort_values(
        by=['cluster_label', 'importance'], ascending=[True, False]
    ).reset_index(drop=True)
    X_filtered = take_most_important_samples(scored, percent, n_clusters )

    # Tolerate a selection that came back without a 'headline' column
    # (possible when no row at all was kept).
    kept_headlines = X_filtered['headline'].tolist() if 'headline' in X_filtered else []

    train_dataframe =  train_dataframe[train_dataframe['category'] != category ]
    X_filtered = pd.DataFrame( {'headline': kept_headlines ,"category": [category] * len(kept_headlines) })
    train_dataframe = pd.concat([train_dataframe,X_filtered ], ignore_index=True)

    print ("New train_dataframe shape : " , train_dataframe.shape )
    print ("--------------------------------------------------\n\n")

In [7]:
# Reduce the POLITICS category: 80 clusters, keep 0.8 of each cluster.
reduce(category='POLITICS', n_clusters=80, percent=0.8)

category shape :  (21232,)
sparse_matrix shape :  (21232, 15580)
cluster_labels shape :  (21232,)
importance_values shape :  (21232,)
category shape :  (21232, 3)
New train_dataframe shape :  (120260, 2)
--------------------------------------------------




In [8]:
# Rows per category after the reduction step.
train_dataframe['category'].value_counts()

category
POLITICS          16988
WELLNESS          10720
ENTERTAINMENT     10355
OTHER2             9183
OTHER1             8981
OTHER3             8198
OTHER5             7820
OTHER6             7550
OTHER4             7533
OTHER7             7251
TRAVEL             5904
STYLE & BEAUTY     5569
FOOD & DRINK       3800
QUEER VOICES       3794
BUSINESS           3573
SPORTS             3041
Name: count, dtype: int64

In [9]:
# One shared nlpaug SynonymAug instance, built once and reused for every call.
aug = naw.SynonymAug()

def augment0_Txt(text):
    """Run ``text`` through the shared augmenter and return its result unchanged."""
    augmented = aug.augment(text)
    return augmented
    
def augmentation ( category , target):
    """Grow one category of the global ``train_dataframe`` towards ``target`` rows.

    Each existing headline of the category is passed once to ``augment0_Txt``
    and every non-empty result is appended as a new row, until the category
    holds ``target`` rows or its headlines are exhausted. The global
    ``train_dataframe`` is rebound to the other categories followed by this
    category's original and new rows, with a fresh 0..n-1 index.

    Args:
        category: Value of the ``category`` column to augment.
        target: Desired number of rows for the category.

    Returns:
        True if at least one augmented row was added, False otherwise
        (including when the category already has ``target`` rows or more, in
        which case the global frame is left untouched).
    """
    global train_dataframe
    data_temp = train_dataframe[train_dataframe['category'] == category]

    print("category: ", category)
    print("Number of headlines : " , data_temp.shape[0])

    missing = target - data_temp.shape[0]
    if missing <= 0:
        return False

    # Collect new rows in a list and build the frame once; concatenating inside
    # the loop re-copies the whole frame for every added headline.
    new_rows = []
    for headline in data_temp['headline']:
        if len(new_rows) >= missing:
            break
        txt1 = augment0_Txt(headline)
        if not txt1:
            continue
        # The augmenter may hand back a list of candidates or a single string;
        # indexing a bare string would keep only its first character.
        if not isinstance(txt1, str):
            txt1 = txt1[0]
        new_rows.append({'headline': txt1, 'category': category})

    flag = bool(new_rows)
    if new_rows:
        ndata_temp = pd.concat([data_temp, pd.DataFrame(new_rows)], ignore_index=True)
    else:
        ndata_temp = data_temp.copy()

    train_dataframe = train_dataframe[train_dataframe['category'] != category]
    print("Number of headlines after augmentation : " , ndata_temp.shape)
    print ("-----------------------------------------------------------")
    # ignore_index keeps the combined frame's index unique, as reduce() does.
    train_dataframe = pd.concat([train_dataframe,ndata_temp], ignore_index=True)
    return flag

In [10]:
# Row count every category is augmented up to.
TARGET_PER_CATEGORY = 16811
# Safety cap on augment/de-duplicate passes: if augmentation keeps producing
# headlines that de-duplication removes again, an unbounded loop never ends.
MAX_AUGMENT_ROUNDS = 100

for _round in range(MAX_AUGMENT_ROUNDS):
    ans = False
    for cat in train_dataframe['category'].unique():
        # Call first, then combine, so every category is processed each round.
        ans = augmentation(cat, TARGET_PER_CATEGORY) or ans

    print("Number of duplicate headlines = ", train_dataframe.duplicated(['headline'], keep = 'last').sum())

    # Reassign instead of mutating in place; the kept rows are the same.
    train_dataframe = train_dataframe.drop_duplicates(['headline'], keep = 'last')

    print("Total number of rows after removing duplicates = ",train_dataframe.shape[0])

    # A full pass in which no category gained a row means nothing is left to do.
    if not ans:
        break
else:
    print("Stopped after", MAX_AUGMENT_ROUNDS, "rounds; some categories may still be below the target")

category:  OTHER7
Number of headlines :  7251
Number of headlines after augmentation :  (14502, 2)
-----------------------------------------------------------
category:  OTHER6
Number of headlines :  7550
Number of headlines after augmentation :  (15100, 2)
-----------------------------------------------------------
category:  WELLNESS
Number of headlines :  10720
Number of headlines after augmentation :  (16811, 2)
-----------------------------------------------------------
category:  OTHER1
Number of headlines :  8981
Number of headlines after augmentation :  (16811, 2)
-----------------------------------------------------------
category:  OTHER4
Number of headlines :  7533
Number of headlines after augmentation :  (15066, 2)
-----------------------------------------------------------
category:  OTHER5
Number of headlines :  7820
Number of headlines after augmentation :  (15640, 2)
-----------------------------------------------------------
category:  STYLE & BEAUTY
Number of headlin

Number of headlines after augmentation :  (16811, 2)
-----------------------------------------------------------
category:  OTHER5
Number of headlines :  16811
category:  STYLE & BEAUTY
Number of headlines :  16807
Number of headlines after augmentation :  (16811, 2)
-----------------------------------------------------------
category:  OTHER3
Number of headlines :  16811
category:  TRAVEL
Number of headlines :  16798
Number of headlines after augmentation :  (16811, 2)
-----------------------------------------------------------
category:  ENTERTAINMENT
Number of headlines :  16810
Number of headlines after augmentation :  (16811, 2)
-----------------------------------------------------------
category:  FOOD & DRINK
Number of headlines :  16648
Number of headlines after augmentation :  (16811, 2)
-----------------------------------------------------------
category:  OTHER2
Number of headlines :  16810
Number of headlines after augmentation :  (16811, 2)
--------------------------------

In [11]:
# Rows per category after the augmentation / de-duplication loop.
train_dataframe['category'].value_counts()

category
POLITICS          16988
WELLNESS          16811
OTHER5            16811
OTHER3            16811
OTHER7            16811
OTHER1            16811
OTHER4            16811
STYLE & BEAUTY    16811
ENTERTAINMENT     16811
OTHER2            16811
TRAVEL            16811
SPORTS            16811
FOOD & DRINK      16811
QUEER VOICES      16811
BUSINESS          16811
OTHER6            16811
Name: count, dtype: int64

In [12]:
# Split the balanced frame back into its headline and category columns.
X_train = train_dataframe.loc[:, 'headline']
y_train = train_dataframe.loc[:, 'category']

In [13]:
# Put the headline Series into a single-column frame named 'Data'
df = pd.DataFrame({'Data': list(X_train)})

# Destination CSV. Forward slashes resolve on Windows and POSIX alike.
# NOTE(review): this is the same file the notebook reads at the top, so the
# original training split is overwritten.
csv_file_path = "Spliting Data/X_train.csv"

# Write DataFrame to CSV file (without the index column)
df.to_csv(csv_file_path, index=False)

In [14]:
# Put the category Series into a single-column frame named 'Data'
df = pd.DataFrame({'Data': list(y_train)})

# Destination CSV. Forward slashes resolve on Windows and POSIX alike.
# NOTE(review): this is the same file the notebook reads at the top, so the
# original training labels are overwritten.
csv_file_path = "Spliting Data/y_train.csv"

# Write DataFrame to CSV file (without the index column)
df.to_csv(csv_file_path, index=False)