In [1]:
import pandas as pd
import numpy as np
import torch

from tqdm import tqdm

from utils import ComputeCleanTexts, ChangeTimeFormat, CountCommaSeparated, CountSpaceSeperated

### 1. Load data

In [2]:
processed_train_data = pd.read_csv("../Data/train.csv")
processed_eval_data = pd.read_csv("../Data/evaluation.csv")

### 2. Compute clean texts

Clean texts are tokenized, lemmatized and filtered.

In [3]:
processed_train_data['clean_text'] = ComputeCleanTexts(processed_train_data['text'])

665777it [01:47, 6202.25it/s]


In [4]:
for i in range(5):            
    print(i,": "+processed_train_data.loc[i,'text'])

0 : Smh I give up
1 : Most of us are Human Beings, but I think you miss that boat George...
2 : Old dirty tricks Trump, at it again...like we don't know what Fauci would say! Ha Ha Ha ha ha ha ha
3 : Seriously..... I worked 86 hours my last check and it didn’t even come close to this....
4 : May ALMIGHTY ALLAH have mercy on us all. Only lagosians observed real lockdown in Nigeria


In [5]:
for i in range(5):            
    print(i,": "+processed_train_data.loc[i,'clean_text'])

0 : smh give
1 : u human being think miss boat george
2 : old dirty trick trump like know fauci would say ha ha ha ha ha ha ha
3 : seriously worked 86 hour last check even come close
4 : may almighty allah mercy u lagosians observed real lockdown nigeria


In [6]:
processed_eval_data['clean_text'] = ComputeCleanTexts(processed_eval_data['text'])

285334it [00:47, 5958.86it/s]


### 3. Extract time from timestamp:

Using timestamp, one retrieves year, month day, week day and hour of publication of the tweet.

In [7]:
processed_train_data = ChangeTimeFormat(processed_train_data)

100%|██████████| 665777/665777 [01:11<00:00, 9369.07it/s] 


In [8]:
processed_eval_data = ChangeTimeFormat(processed_eval_data)

100%|██████████| 285334/285334 [00:28<00:00, 10027.05it/s]


### 4. Replace verified boolean by 0/1

In [9]:
processed_train_data['user_verified'] *= 1

In [10]:
processed_eval_data['user_verified'] *= 1

### 5. Count comma-separated fields

##### URLs

In [11]:
processed_train_data['url_count'] = CountCommaSeparated(processed_train_data, 'urls')

100%|██████████| 665777/665777 [00:00<00:00, 1221847.40it/s]


In [12]:
processed_eval_data['url_count'] = CountCommaSeparated(processed_eval_data, 'urls')

100%|██████████| 285334/285334 [00:00<00:00, 1217579.89it/s]


##### User mentions

In [13]:
processed_train_data['user_mentions_count'] = CountCommaSeparated(processed_train_data, 'user_mentions')

100%|██████████| 665777/665777 [00:00<00:00, 1337089.42it/s]


In [14]:
processed_eval_data['user_mentions_count'] = CountCommaSeparated(processed_eval_data, 'user_mentions')

100%|██████████| 285334/285334 [00:00<00:00, 1491460.25it/s]


##### Hashtags

In [15]:
processed_train_data['hashtag_count'] = CountCommaSeparated(processed_train_data, 'hashtags')

100%|██████████| 665777/665777 [00:00<00:00, 1605943.71it/s]


In [16]:
processed_eval_data['hashtag_count'] = CountCommaSeparated(processed_eval_data, 'hashtags')

100%|██████████| 285334/285334 [00:00<00:00, 1585198.80it/s]


### 6. Count text length

In [17]:
processed_train_data['text_length'] = CountSpaceSeperated(processed_train_data, 'text')

100%|██████████| 665777/665777 [00:01<00:00, 653432.02it/s]


In [18]:
processed_eval_data['text_length'] = CountSpaceSeperated(processed_eval_data, 'text')

100%|██████████| 285334/285334 [00:00<00:00, 557593.55it/s]


### 7. Save data (dataframe)

In [19]:
processed_train_data.to_pickle("Data/train_processed.pkl")

In [20]:
processed_eval_data.to_pickle("Data/eval_processed.pkl")

### 8. Save data (pytorch tensors)

In [21]:
train_tensor = torch.tensor(np.array(processed_train_data[['month', 'week_day', 'day', 'hour', 'user_verified', 'user_followers_count', 'user_friends_count', 'user_mentions_count', 'url_count', 'hashtag_count', 'text_length']]), dtype=torch.float64, device='cpu')
torch.save(train_tensor, '../Tensors/Training/12_features_tr.pt')

In [22]:
test_tensor = torch.tensor(np.array(processed_eval_data[['month', 'week_day', 'day', 'hour', 'user_verified', 'user_followers_count', 'user_friends_count', 'user_mentions_count', 'url_count', 'hashtag_count', 'text_length']]), dtype=torch.float64, device='cuda')
torch.save(test_tensor, '../Tensors/Testing/12_features_test.pt')

RuntimeError: No CUDA GPUs are available