In [1]:
import os
import re
import csv
import sys
import nltk
import string
import chardet
import numpy as np
import pandas as pd
from autocorrect import spell
from nlppreprocess import NLP
from textblob import Word, TextBlob
from nltk.stem import PorterStemmer
from nltk.stem.isri import ISRIStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, webtext
from tashaphyne.stemming import ArabicLightStemmer

In [2]:
def one_file_preprocess(Inputfile):
    '''
    Read a CSV file of tweets and pass its ``full_text`` column through the
    Arabic cleaning pipeline.

    Malformed CSV rows are skipped (with a warning) instead of aborting the read.
    The number of rows that were read is printed.

    Argument:
        Inputfile: The path of the CSV file to read; it must contain a
            ``full_text`` column.
    Return:
        The result of ``arabic_pip_line`` applied to the list of ``full_text``
        values of this file.
    '''
    try:
        # ``error_bad_lines`` was removed in pandas 2.0; ``on_bad_lines='warn'``
        # is its replacement with the same "skip and warn" behavior.
        df_file = pd.read_csv(Inputfile, lineterminator='\n', on_bad_lines='warn', encoding='utf-8')
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the legacy keyword.
        df_file = pd.read_csv(Inputfile, lineterminator='\n', error_bad_lines=False, encoding='utf-8')
    print(len(df_file))
    full_text_list = list(df_file['full_text'])
    full_text_list_preprocessed = arabic_pip_line(full_text_list)
    return full_text_list_preprocessed

In [3]:
def read_direction_preprocess(direction_path):
    '''
    Preprocess every file of one directory and save each result as a CSV file.

    Each file is passed to ``one_file_preprocess`` and the returned tweets are
    written under a ``full_text`` column. The output path is derived from the
    input path: its second '/'-separated component is replaced by
    ``preprocessed`` and its last 7 characters are replaced by ``csv``.

    The first exception stops the loop; it is appended to
    ``logs/direction_and_file_handleing_file.log`` instead of being raised.

    Argument:
        direction_path: The directory to read files from, written with '/'
            separators. A missing trailing '/' is tolerated.
    Return:
        Always True, whether or not an error was logged.
    '''
    try:
        file_names = os.listdir(direction_path)
        # Accept the directory with or without its trailing separator.
        if direction_path.endswith('/'):
            base_path = direction_path
        else:
            base_path = direction_path + '/'

        for file_name in file_names:
            source_path = base_path + file_name
            full_text_list_preprocessed = one_file_preprocess(source_path)

            # Redirect the output to the 'preprocessed' folder (second path component).
            path_parts = source_path.split('/')
            path_parts[1] = 'preprocessed'
            target_path = '/'.join(path_parts)
            # NOTE(review): drops the last 7 characters of the name before adding
            # 'csv' - presumably a fixed-length suffix of the input files; confirm.
            target_path = target_path[:-7] + 'csv'

            df_preprocessed = pd.DataFrame({'full_text': full_text_list_preprocessed})
            df_preprocessed.to_csv(target_path, index=False)
    except Exception as e:
        # ``with`` guarantees the log handle is flushed and closed.
        with open("logs/direction_and_file_handleing_file.log", "+a", encoding="utf-8") as log_file:
            log_file.write("This error related to function read_direction_preprocess of direction_and_file_handleing file \n"
                           + str(e) + "\n" + "#" * 99 + "\n")  # "#" * 99 as separated lines
    return True

In [4]:
def handle_direction_analysis(direction, replace_with):
    '''
    Derive a new directory path from a default one, for example from
    "COVID-19-Arabic-Tweets-Dataset/COVID19-tweetID-2020-01/" to
    "COVID-19-Arabic-Tweets-Dataset/COVID19-tweetID-2020-02/".

    Every occurrence of a digit 1-9 directly followed by '/' is substituted
    by ``replace_with`` (which is used as a regular-expression replacement).
    Argument:
        direction: the folder path you need to change.
        replace_with: the text that takes the place of each "<digit>/" match.
    Return:
        The new direction we need to work with.
    '''
    digit_then_slash = re.compile('([1-9]/)')
    return digit_then_slash.sub(replace_with, direction)

In [5]:
def one_file_analysis(Inputfile):
    '''
    Get the ``full_text`` column of a CSV file as one list.

    Malformed CSV rows are skipped (with a warning) instead of aborting the read.
    Argument:
        Inputfile: The path of the CSV file to read; it must contain a
            ``full_text`` column.
    Return:
        full_text_list: the ``full_text`` values as a list, one item per row.
    '''
    try:
        # ``error_bad_lines`` was removed in pandas 2.0; ``on_bad_lines='warn'``
        # is its replacement with the same "skip and warn" behavior.
        df_file = pd.read_csv(Inputfile, lineterminator='\n', on_bad_lines='warn', encoding='utf-8')
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the legacy keyword.
        df_file = pd.read_csv(Inputfile, lineterminator='\n', error_bad_lines=False, encoding='utf-8')
    full_text_list = list(df_file['full_text'])
    return full_text_list

In [6]:
def read_direction_analysis(direction_path, convert_list = "to_list_of_words"):
    '''
    Collect the content of every file of one directory in a single list.

    Each file is read with ``one_file_analysis``. When ``convert_list`` is
    "to_list_of_words" the tweets of the file are first passed through
    ``convert_list_of_strings_to_list_of_words``; the result of every file is
    appended to one list.

    The first exception stops the loop; it is appended to
    ``logs/direction_and_file_handleing.log`` instead of being raised, and
    whatever was collected so far is returned.

    Argument:
        direction_path: The directory to read files from, written with '/'
            separators. A missing trailing '/' is tolerated.
        convert_list: "to_list_of_words" (default) to convert the tweets of each
            file; any other value keeps the tweets as read.
    Return:
        The items collected from all files of the directory (an empty list when
        nothing could be read).
    '''
    # Created before the ``try`` so the final ``return`` is valid even when
    # listing the directory itself fails.
    all_words_of_direction = []
    try:
        file_names = os.listdir(direction_path)
        # Accept the directory with or without its trailing separator.
        if direction_path.endswith('/'):
            base_path = direction_path
        else:
            base_path = direction_path + '/'

        for file_name in file_names:
            full_text_list = one_file_analysis(base_path + file_name)

            if convert_list == "to_list_of_words":
                full_text_list = convert_list_of_strings_to_list_of_words(full_text_list)

            # extend the list of words of previous files to new words of another file
            all_words_of_direction.extend(full_text_list)

    except Exception as e:
        print("="*50)
        # ``with`` guarantees the log handle is flushed and closed.
        with open("logs/direction_and_file_handleing.log", "+a", encoding="utf-8") as log_file:
            log_file.write("This error related to function read_direction_analysis of direction_and_file_handleing file \n"
                           + str(e) + "\n" + "#" * 99 + "\n")  # "#" * 99 as separated lines
    return all_words_of_direction

In [7]:
def shuffle_dropna_separate_split(df_file, tweet_text, tweet_class, test_size=.2, random_state=None):
    '''
    Drop incomplete rows, shuffle the tweets and split them into training and
    testing parts.

    The caller's DataFrame is not modified. Summary counts are printed along
    the way.

    Argument:
        df_file: DataFrame holding the tweets and their labels.
        tweet_text: name of the column that holds the tweet text.
        tweet_class: name of the column that holds the label (1 / 0).
        test_size: share of the rows kept for testing, forwarded to
            ``train_test_split`` (default .2).
        random_state: optional seed used for both the shuffle and the split so
            a run can be reproduced (default None, i.e. not seeded).
    Return:
        X_train, X_test, y_train, y_test as returned by ``train_test_split``.
    '''
    # Work on a cleaned, shuffled copy so the caller's frame is left untouched.
    df_file = (
        df_file.dropna()
        .sample(frac=1, random_state=random_state)  # Shuffle tweets
        .reset_index(drop=True)
    )
    print("The number of rows in this file are: ", len(df_file))
    print("The columns are: ", df_file.columns)
    # Count on the label column that was actually passed in, not a fixed 'class'.
    print("The number of class 1 which represent this tweet talks about cron: ", len(df_file[df_file[tweet_class] == 1]))
    print("The number of class 0 which represent this tweet not talks about cron: ", len(df_file[df_file[tweet_class] == 0]))

    # Separate tweets and target classification
    tweets_text = df_file[tweet_text]
    tweets_class = df_file[tweet_class]

    # Split data into training and testing
    # NOTE(review): ``train_test_split`` is not imported in the imports cell shown
    # here - presumably sklearn.model_selection.train_test_split; confirm it is
    # imported before this function is called.
    X_train, X_test, y_train, y_test = train_test_split(
        tweets_text, tweets_class, test_size=test_size, random_state=random_state
    )

    # Display
    print("Our training data now are: " + str(len(X_train)) + " Tweets")
    print("Our testing data now are: " + str(len(X_test)) + " Tweets")
    print("Our training data now are: " + str(len(y_train)) + " labels")
    print("Our testing data now are: " + str(len(y_test)) + " labels")

    return X_train, X_test, y_train, y_test