In [2]:
import re
import csv
import torch
import numpy as np
import pandas as pd
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification


# 0. Fix mixed datatypes in pd.DataFrame


In [24]:
# 1. Choose the columns we are interested in and state the dtype expected for each.
#    Only the keys are used below (as `usecols`); the dtypes document the target
#    schema and are not passed to read_csv.
dict_cols = {
    'self_text': str,
    'created_time': 'datetime64[ns]',
    'post_self_text': str,
    'post_title': str,
    'post_created_time': 'datetime64[ns]',
}

# 2. Open the file and read data
file = "pre_processed.csv"
df = pd.read_csv(file, usecols=list(dict_cols.keys()))
print(f"Choosing columns...\n {list(df.columns)}")

Choosing columns...
 ['created_time', 'self_text', 'post_self_text', 'post_title', 'post_created_time']


In [25]:
# 3. Verify if columns have mixed types
def find_mixed_type_columns(dataframe):
    """Report the columns of ``dataframe`` whose values are not all one Python type.

    Each column is iterated element by element and the ``type()`` of every
    value is collected into a set.

    Returns:
        A list of ``(column_name, set_of_types)`` tuples, in column order,
        containing only the columns where more than one type was seen.
    """
    types_by_column = (
        (name, {type(value) for value in dataframe[name]})
        for name in dataframe.columns
    )
    return [(name, seen) for name, seen in types_by_column if len(seen) > 1]
    
# Baseline check, run before any cleaning is applied to df.
# Expected on this dataset: self_text, post_self_text, post_title and
# post_created_time each report {str, float}.
mixed_one = find_mixed_type_columns(df)
print(f"Before Columns with mixed types: {mixed_one}")

Before Columns with mixed types: [('self_text', {<class 'str'>, <class 'float'>}), ('post_self_text', {<class 'str'>, <class 'float'>}), ('post_title', {<class 'str'>, <class 'float'>}), ('post_created_time', {<class 'str'>, <class 'float'>})]


In [26]:
# 4. Fix mixed types
# Treat literal 'nan' strings as missing values.
df.replace('nan', pd.NA, inplace=True)

# Parse both timestamp columns; values that cannot be parsed become NaT
# because of errors='coerce'.
datetime_columns = ['post_created_time', 'created_time']
for col in datetime_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Replace missing text with an empty string so no NaN/NA is left in these columns.
text_columns = ['self_text', 'post_self_text', 'post_title']
for col in text_columns:
    df[col] = df[col].fillna('')

# Drop rows whose timestamps are missing or could not be parsed. Both datetime
# columns are checked: a NaT left in either one would mix Timestamp and NaT
# values and be reported by the check below. The text columns cannot contain
# missing values after fillna(''), so they are not part of the subset.
df.dropna(subset=datetime_columns, inplace=True)

# Verify the changes
mixed_final = find_mixed_type_columns(df)
print("\nFinal Columns with mixed types:", mixed_final)


Final Columns with mixed types: []


In [28]:
# Persist the cleaned frame first: to_csv returns None when given a path, so it
# must not be the cell's last expression if a preview is wanted.
df.to_csv('final_data.csv', index=False)

# Preview the first rows; as the last expression this is what the cell displays.
df.head()

Unnamed: 0,created_time,self_text,post_self_text,post_title,post_created_time
0,2024-03-19 19:18:08,What is this supposed to prove? That nobody in...,"So, Mr. Netanyahu, and supporters, are you goo...",article: Famine in northern Gaza is imminent a...,2024-03-19 17:48:02
1,2024-03-19 19:17:51,If only there was an armed military force near...,,Local Gaza gangs are ‘robbing displaced Palest...,2024-03-19 13:25:00
2,2024-03-19 19:17:35,Taliban created and funded by america and isis...,Most pro Palestine supporters who are not Arab...,Why is the West so concerned?,2024-03-19 17:45:08
3,2024-03-19 19:17:25,Recycling glass is still not economical or par...,,The plastic industry knowingly pushed recyclin...,2024-03-19 17:51:10
4,2024-03-19 19:17:22,The two have a private direct communication li...,,Biden privately told Netanyahu he's not trying...,2024-03-19 19:00:41
