In [64]:
import re
import numpy as np 
import pandas as pd 

from googletrans import Translator
from langdetect import detect


##### Political DataFrame

In [65]:
# Read the translated tweet dumps of the three political accounts, in the same
# order as before. The "dataset/@ImranRiazKhan_919.csv" file is currently not
# loaded (its read was commented out in the original cell).
imran_pti_df, cm_shahbaz_df, pti_official_df = map(
    pd.read_csv,
    (
        "dataset/@ImranKhanPTI_848_translate.csv",
        "dataset/@CMShehbaz_864_translate.csv",
        "dataset/@PTIofficial_841_translate.csv",
    ),
)

# Stack the per-account frames row-wise; each frame keeps its own index labels.
political_frames = [imran_pti_df, cm_shahbaz_df, pti_official_df]
political_df = pd.concat(political_frames)
political_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tweets,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,0,"Our completely unarmed, brave and passionate w...",,,
1,1,1,Today would have been Arshad Sharif's 50th bir...,,,
2,2,2,After what happened to Arshad Sharif despite h...,,,
3,3,3,& bring about substantive structural reforms t...,,,
4,4,4,"As I predicted 6 months ago, the conspiracy of...",,,


In [66]:
# Remove the leftover "Unnamed: *" columns, then discard rows that still hold a
# missing value.
# errors="ignore" makes this cell safe to re-run: once the columns are gone, a
# second execution no longer raises KeyError.
# The frame is still modified in place, as before.
political_df.drop(
    columns=["Unnamed: 0", "Unnamed: 2", "Unnamed: 3", "Unnamed: 4", "Unnamed: 0.1"],
    errors="ignore",
    inplace=True,
)
political_df.dropna(axis=0, inplace=True)
political_df.head()

Unnamed: 0,Tweets
0,"Our completely unarmed, brave and passionate w..."
1,Today would have been Arshad Sharif's 50th bir...
2,After what happened to Arshad Sharif despite h...
3,& bring about substantive structural reforms t...
4,"As I predicted 6 months ago, the conspiracy of..."


In [67]:
# Patterns stripped from every tweet by clean_sentence().
hashtag_regex = re.compile(r'\#\w+')   # '#' followed by one or more word characters
link_regex = re.compile(r'http\S+')    # 'http' followed by a run of non-whitespace
mention_regex = re.compile(r'@\w+')    # '@' followed by one or more word characters


def clean_sentence(df):
    """Strip hashtags, links, mentions and newlines from the ``Tweets`` column.

    The patterns are applied in a fixed order: hashtags, then links, then
    mentions, and finally literal newline characters. Every match is replaced
    by the empty string, so surrounding spaces are left as they are and words
    separated only by a newline end up joined.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Tweets``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``Tweets`` column is overwritten in place.
    """
    tweets = df['Tweets']
    # regex=True is passed explicitly: with a compiled pattern, pandas versions
    # whose default is regex=False raise ValueError instead of substituting.
    for pattern in (hashtag_regex, link_regex, mention_regex):
        tweets = tweets.str.replace(pattern, '', regex=True)
    # The newline is a plain literal, so no regex engine is needed here.
    df['Tweets'] = tweets.str.replace("\n", '', regex=False)
    return df

# Run the tweet cleaner over the political frame and preview the first rows.
political_df = political_df.pipe(clean_sentence)
political_df.head()

Unnamed: 0,Tweets
0,"Our completely unarmed, brave and passionate w..."
1,Today would have been Arshad Sharif's 50th bir...
2,After what happened to Arshad Sharif despite h...
3,& bring about substantive structural reforms t...
4,"As I predicted 6 months ago, the conspiracy of..."


In [68]:
# Tag every row with its class; a scalar is broadcast to the whole column, so
# no per-row list has to be built.
political_df['label'] = "political"

In [69]:
# Persist the cleaned political tweets; the row count is embedded in the file name.
# NOTE(review): the index is written as well (no index=False) - presumably the
# origin of the "Unnamed: 0" columns seen when such files are read back; confirm
# with downstream readers before changing.
political_csv_path = f"clean_dataset/clean_political_{len(political_df)}.csv"
political_df.to_csv(political_csv_path)

##### Religious DataFrame

In [80]:
# Read the tweet dumps of the three religious accounts, in the same order as before.
islamic_df, menk_df, pop_df = map(
    pd.read_csv,
    (
        "dataset/@islamicstrength_739.csv",
        "dataset/@muftimenk_835.csv",
        "dataset/@Pontifex_847.csv",
    ),
)

In [81]:
# Stack the three religious accounts row-wise and discard the "Unnamed: 0"
# column that comes along from the CSV files.
religious_frames = [islamic_df, menk_df, pop_df]
religious_df = pd.concat(religious_frames).drop(columns=["Unnamed: 0"])

# Row count before missing values are removed, followed by a preview.
print(religious_df.shape[0])
religious_df.head()

2421


Unnamed: 0,Tweets,label
0,,religious
1,"O Allah, remove all arrogance and selfishness ...",religious
2,Allah Has Planned It Beautifully. So Wait Pati...,religious
3,We strive to make sure people have something t...,religious
4,The moment you share something online. \nYou h...,religious


In [82]:
# Discard, in place, every row that has a missing value in any column, then
# report how many rows are left.
religious_df.dropna(axis="index", how="any", inplace=True)
print(religious_df.shape[0])
religious_df.head()

2420


Unnamed: 0,Tweets,label
1,"O Allah, remove all arrogance and selfishness ...",religious
2,Allah Has Planned It Beautifully. So Wait Pati...,religious
3,We strive to make sure people have something t...,religious
4,The moment you share something online. \nYou h...,religious
5,If it's stopping you from getting closer to Al...,religious


In [83]:
# Run the tweet cleaner over the religious frame and preview the first rows.
religious_df = religious_df.pipe(clean_sentence)
religious_df.head()

Unnamed: 0,Tweets,label
1,"O Allah, remove all arrogance and selfishness ...",religious
2,Allah Has Planned It Beautifully. So Wait Pati...,religious
3,We strive to make sure people have something t...,religious
4,The moment you share something online. You hav...,religious
5,If it's stopping you from getting closer to Al...,religious


In [84]:
# Persist the cleaned religious tweets; the row count is embedded in the file name.
# NOTE(review): the index is written as well (no index=False) - presumably the
# origin of the "Unnamed: 0" columns seen when such files are read back; confirm
# with downstream readers before changing.
religious_csv_path = f"clean_dataset/clean_religious_{len(religious_df)}.csv"
religious_df.to_csv(religious_csv_path)

#### Merge All Class Datasets

In [85]:
# Combine both labelled classes into a single frame (political rows first),
# print the total number of rows and preview the result.
class_frames = [political_df, religious_df]
df = pd.concat(class_frames, axis=0)
print(df.shape[0])
df.head()

4973


Unnamed: 0,Tweets,label
0,"Our completely unarmed, brave and passionate w...",political
1,Today would have been Arshad Sharif's 50th bir...,political
2,After what happened to Arshad Sharif despite h...,political
3,& bring about substantive structural reforms t...,political
4,"As I predicted 6 months ago, the conspiracy of...",political


In [86]:
# Shuffle all rows so the two classes are interleaved, then renumber the index
# from 0. A fixed random_state keeps the row order - and therefore the CSV
# written afterwards - identical across "Restart & Run All" executions.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,Tweets,label
0,"To the law enforcement agencies, doctors, para...",political
1,"Rather, just for the support and pleasure of t...",political
2,І не забуваймо багатостраждальний український ...,religious
3,Balloki Nature Reserve initiated in 2019 by ta...,political
4,I extend my profound condolences on the sad de...,political


In [87]:
# Persist the merged, shuffled dataset; the row count is embedded in the file name.
# NOTE(review): the index is written as well (no index=False) - presumably the
# origin of the "Unnamed: 0" columns seen when such files are read back; confirm
# with downstream readers before changing.
merged_csv_path = f"clean_dataset/clean_dataset_{len(df)}.csv"
df.to_csv(merged_csv_path)