In [1]:
import pandas as pd
import re
import contractions

In [17]:
# Characters that remove_punct replaces with a space.
# The value is the same set of characters as Python's string.punctuation.
PUNCTUATION = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [2]:
# Lower-case English stopwords that string_contractions filters out of the text.
# Besides ordinary function words the set holds contraction forms ("don't") and the
# fragments left when such forms are split on the apostrophe ('don', 't', 'll', 've').
# NOTE(review): looks like NLTK's English stopword list (179 entries) - confirm the source.
STOPWORDS = {'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves'}

In [3]:
# Load the dataset from the Excel workbook into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_excel('data/Full dataset.xlsx')

In [4]:
# Keep only the two text columns and the target label; every other column is discarded.
df = df.loc[:, ['title', 'content', 'Label']]

In [5]:
# Preview the first rows of the selected columns before any cleaning.
df.head()

Unnamed: 0,title,content,Label
0,\nGovernment Must Provide Security for AML in ...,"Last week, armed men carrying machetes and oth...",1.0
1,\nLiberia Army Chief Of Staff Predicts Unfores...,"Amid recent coup in Niger, the Chief Staff of ...",1.0
2,"\nGuns & Machetes: Men Attack, Vandalize Arcel...",A group of unknown men said to be from Zolowee...,1.0
3,\nPresident Weah commits to delivering violen...,"Liberian President, George Weah, has pledged t...",1.0
4,\nNew World Bank Liberia Country Manager Arrives,"MONROVIA, July 31, 2023 â€“ The new World Bank...",1.0


In [6]:
# Fraction of rows whose title is missing.
df.title.isna().mean()

0.016779021768182224

In [7]:
# Remove, in place, every row that has a missing value in any of the kept columns
# (title, content or Label) - not only rows with a missing title.
df.dropna(axis=0, how='any', inplace=True)

In [8]:
# 5th percentile of the title length in characters; used as the lower cut-off below.
df.title.str.len().quantile(0.05)

24.0

In [9]:
# 95th percentile of the title length in characters; used as the upper cut-off below.
df.title.str.len().quantile(0.95)

126.0

In [10]:
# Discard rows whose title has fewer than 24 characters (24 is the 5th-percentile
# title length reported above). The threshold is a fixed number so that re-running
# this cell does not trim the data any further.
df.drop(index=df.loc[df.title.str.len() < 24].index, inplace=True)

In [11]:
# Discard rows whose title has more than 126 characters (126 is the 95th-percentile
# title length reported above, measured before the short titles were removed).
df.drop(index=df.loc[df.title.str.len() > 126].index, inplace=True)

In [12]:
# Summary statistics of the title length after both length filters.
df.title.str.len().describe()

count    29396.000000
mean        57.699585
std         23.020405
min         24.000000
25%         39.000000
50%         53.000000
75%         73.000000
max        126.000000
Name: title, dtype: float64

In [13]:
# Remove rows whose title consists solely of numeric characters (str.isnumeric),
# then do the same for content. Values are cast to str first so that cells which
# are not strings are handled as well.
df.drop(index=df.loc[df['title'].astype(str).str.isnumeric()].index, inplace=True)
df.drop(index=df.loc[df['content'].astype(str).str.isnumeric()].index, inplace=True)

In [14]:
def delete_not_ascii(text):
    """Return ``text`` without any character outside the ASCII range 0x00-0x7F.

    Non-ASCII characters are deleted, not replaced, so neighbouring characters
    end up adjacent to each other.
    """
    return re.sub(r'[^\x00-\x7f]', '', text)

def remove_urls(text):
    """Delete URLs from ``text``.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.`` and
    runs up to the next whitespace character. The whitespace around it is kept.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

def remove_number(text):
    """Replace every number in ``text`` with the placeholder ``' NUMBER '``.

    A number is an optional sign, any mix of digits and dots containing at least
    one digit, plus any directly following run of digits, colons, commas and dots.
    Values such as ``1,000.50``, ``10:30`` or ``-3.5`` therefore collapse into a
    single placeholder. The placeholder is padded with spaces so it never fuses
    with neighbouring text.
    """
    return re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', r' NUMBER ', text)

def remove_punct(text):
    """Replace every character listed in ``PUNCTUATION`` with a single space.

    Args:
        text: String to process.

    Returns:
        ``text`` with each punctuation character swapped for one space. Runs of
        spaces are not collapsed here.
    """
    # re.escape keeps every character literal inside the character class, so the
    # result does not depend on how ']', '\\', '^' or '-' happen to be ordered or
    # escaped in PUNCTUATION.
    return re.sub('[' + re.escape(PUNCTUATION) + ']', ' ', text)

def clean_text(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace - spaces, tabs and line breaks alike - becomes one
    space, and leading and trailing whitespace is removed.

    Args:
        text: String to process.

    Returns:
        The whitespace-normalised string.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    # '\s+' already matches line breaks, so no separate newline pass is needed.
    return re.sub(r'\s+', ' ', text).strip()

def convert_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def string_contractions(text):
    """Expand contractions and drop stopwords and very short words.

    The text is split on whitespace. Words shorter than two characters and words
    found in ``STOPWORDS`` are discarded. Each remaining word is passed through
    ``contractions.fix``. The words produced by that expansion are filtered again,
    so a stopword introduced by an expansion does not slip through.

    For this second check, punctuation attached to a word is ignored: a stopword
    followed by a comma is still recognised as a stopword. The kept words are
    returned unchanged, with any punctuation still attached.

    ``STOPWORDS`` holds lower-case entries, so ``text`` should be lower-cased first.

    Args:
        text: String to process.

    Returns:
        The kept words joined by single spaces.
    """
    kept_words = []
    for raw_word in text.split():
        # Cheap pre-filter: nothing to expand for words that are dropped anyway.
        if len(raw_word) < 2 or raw_word in STOPWORDS:
            continue
        # An expansion may turn one word into several, so check each resulting word.
        for word in contractions.fix(raw_word).split():
            bare_word = word.strip(PUNCTUATION)
            if len(bare_word) >= 2 and bare_word not in STOPWORDS:
                kept_words.append(word)
    return ' '.join(kept_words)

In [15]:
def basic_cleaning(df, column_name):
    """Run the text-cleaning steps over ``df[column_name]``, overwriting the column.

    The frame passed in is modified directly and nothing is returned. The steps
    run in a fixed order, one full pass over the column per step:

    1. drop non-ASCII characters
    2. delete URLs
    3. replace numbers with the ``NUMBER`` placeholder
    4. lower-case (this also lower-cases the placeholder)
    5. expand contractions and drop stopwords / very short words
    6. replace punctuation with spaces
    7. collapse whitespace

    Args:
        df: DataFrame holding the column; its values are passed to string functions.
        column_name: Name of the text column to clean.
    """
    cleaning_steps = (
        delete_not_ascii,
        remove_urls,
        remove_number,
        convert_lower_case,
        string_contractions,  # must run before remove_punct strips the apostrophes
        remove_punct,
        clean_text,
    )
    for step in cleaning_steps:
        df[column_name] = df[column_name].apply(step)

In [18]:
# Clean the title column; basic_cleaning overwrites the column on df itself.
basic_cleaning(df, 'title')

In [19]:
# Clean the content column the same way; basic_cleaning overwrites the column on df itself.
basic_cleaning(df, 'content')

In [20]:
# Preview the first rows after cleaning both text columns.
df.head()

Unnamed: 0,title,content,Label
0,government must provide security aml wake arm ...,last week armed men carrying machetes deadly w...,1.0
1,liberia army chief staff predicts unforeseen a...,amid recent coup niger chief staff liberian ar...,1.0
2,guns machetes men attack vandalize arcelormitt...,group unknown men said zolowee town proximity ...,1.0
3,president weah commits delivering violent free...,liberian president george weah pledged deliver...,1.0
4,new world bank liberia country manager arrives,monrovia july number number new world bank cou...,1.0


In [21]:
# Share of rows per Label value (count of each label divided by the number of rows).
df.Label.value_counts()/len(df)

Label
1.0    0.855316
0.0    0.144684
Name: count, dtype: float64

In [22]:
# Write the cleaned frame to a CSV file in the working directory.
# NOTE(review): index=False is not passed, so the row index is written as the first
# column - confirm whether downstream readers expect it.
df.to_csv('clean_data_title_content.csv')