In [3]:
import pandas as pd
from termcolor import colored
from sklearn.model_selection import train_test_split

In [4]:

# Define columns to load
COLUMNS = ['Sentiment', 'Tweet']
print(colored("Loading useful columns: Sentiment and Tweet", "yellow"))

Loading useful columns: Sentiment and Tweet


In [5]:
# Load dataset with only the required columns
dataset = pd.read_csv('/content/tweets.csv', usecols=[0, 5], names=COLUMNS, encoding='latin-1', skiprows=1)
print(colored("Data loaded with {} rows and {} columns".format(dataset.shape[0], dataset.shape[1]), "yellow"))


Data loaded with 1599999 rows and 2 columns


In [6]:
# Train-test split
print(colored("Splitting dataset into 80% train and 20% test", "yellow"))
X_train, X_test, y_train, y_test = train_test_split(
    dataset['Tweet'], dataset['Sentiment'], test_size=0.20, random_state=100
)

Splitting dataset into 80% train and 20% test


In [7]:
# Create train and test datasets
train_dataset = pd.DataFrame({'Tweet': X_train, 'Sentiment': y_train}).reset_index(drop=True)
test_dataset = pd.DataFrame({'Tweet': X_test, 'Sentiment': y_test}).reset_index(drop=True)

In [8]:
# Display distributions for verification
print(colored("Train data sentiment distribution:", "yellow"))
print(train_dataset['Sentiment'].value_counts())
print(colored("Test data sentiment distribution:", "yellow"))
print(test_dataset['Sentiment'].value_counts())
print(colored("Split complete", "yellow"))

Train data sentiment distribution:
Sentiment
0    640138
4    639861
Name: count, dtype: int64
Test data sentiment distribution:
Sentiment
4    160139
0    159861
Name: count, dtype: int64
Split complete


In [9]:
# Save datasets
print(colored("Saving train data", "yellow"))
train_dataset.to_csv('/content/train.csv', index=False)
print(colored("Train data saved to /content/train.csv", "green"))

print(colored("Saving test data", "yellow"))
test_dataset.to_csv('/content/test.csv', index=False)
print(colored("Test data saved to /content/test.csv", "green"))

Saving train data
Train data saved to /content/train.csv
Saving test data
Test data saved to /content/test.csv
