In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Build the CSV path from its components rather than a backslash literal:
# inside a normal string "\D" and "\d" are invalid escape sequences (recent
# Python versions warn about them), and a hard-coded "\" separator only
# resolves on Windows. os.path.join yields the same path there and a valid
# one elsewhere.
data_path = os.path.join('..', 'Data', 'data.csv')
df = pd.read_csv(data_path)
df.head()


Unnamed: 0,text,label,split
0,"Dr. B.S. CHAUHAN, J. This reference before us...",1,val
1,This special leave petition has been filed ag...,0,val
2,Leave granted. This appeal has been preferred...,1,train
3,"S. RADHAKRISHNAN, J. We are, in this case, co...",0,train
4,"Dr. B. S. CHAUHAN, J. These appeals have been...",0,test


In [6]:
# Collect every row whose 'text' entry is missing, then print that subset.
null_text = df.loc[df['text'].isna()]
print(null_text)


Empty DataFrame
Columns: [text, label, split]
Index: []


In [5]:
# Number of missing values in each column.
df.isna().sum()


text     0
label    0
split    0
dtype: int64

In [7]:
# Dimensions of the loaded frame as (rows, columns).
df.shape


(54149, 3)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()


Unnamed: 0,label
count,54149.0
mean,0.505531
std,0.499974
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [None]:
# Print a concise overview of the frame: column dtypes, non-null counts, memory usage.
df.info()


In [None]:
# Sizes for a 70 / 15 / 15 train / validation / test partition.
# The first two sizes are truncated to whole rows; the test partition takes
# whatever remains so the three sizes always add up to the full row count.
n_rows = len(df)
train_size = int(0.7 * n_rows)
val_size = int(0.15 * n_rows)
test_size = n_rows - train_size - val_size


In [None]:
# Cut the frame into three consecutive row ranges, in file order (no shuffling).
# NOTE(review): the frame carries its own 'split' column (train / val / test
# values are visible in df.head()), which this positional slicing does not
# consult -- confirm that re-splitting by position is intended.
val_end = train_size + val_size
train_data = df.iloc[:train_size]
val_data = df.iloc[train_size:val_end]
test_data = df.iloc[val_end:]


In [None]:
# Pull the 'label' column of each partition out as a NumPy array.
train_labels, val_labels, test_labels = (
    part['label'].values for part in (train_data, val_data, test_data)
)


In [None]:
# Make sure the output directory exists; exist_ok=True keeps re-runs from raising.
os.makedirs('tfidf_embeddings', exist_ok=True)


In [None]:
# TF-IDF featurizer with the vocabulary capped at 10,000 terms (max_features);
# every other setting is left at the scikit-learn default.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)


In [None]:
# Fit the vectorizer on the training texts only, then apply that same fitted
# vectorizer to the validation and test texts so nothing is learned from the
# held-out partitions.
train_text = train_data['text']
train_tfidf = tfidf_vectorizer.fit_transform(train_text)
val_tfidf, test_tfidf = (
    tfidf_vectorizer.transform(part['text']) for part in (val_data, test_data)
)


In [None]:
# Write each TF-IDF matrix to its own .npy file inside the output folder.
# NOTE(review): .toarray() expands each matrix to a dense array before saving;
# with tens of thousands of rows and up to 10,000 columns this can take several
# GB of RAM -- consider a sparse on-disk format if memory becomes a problem.
folder = 'tfidf_embeddings'
embeddings_by_file = {
    'train_embed.npy': train_tfidf,
    'val_embed.npy': val_tfidf,
    'test_embed.npy': test_tfidf,
}
for file_name, matrix in embeddings_by_file.items():
    np.save(os.path.join(folder, file_name), matrix.toarray())


In [None]:
# Store the label array of each partition in the output folder, one .npy file
# per partition.
labels_by_file = {
    'train_labels.npy': train_labels,
    'val_labels.npy': val_labels,
    'test_labels.npy': test_labels,
}
for file_name, label_array in labels_by_file.items():
    np.save(os.path.join(folder, file_name), label_array)
