# Vscode ENV problems

We noticed you're using a conda environment. If you are experiencing issues with this environment in the integrated terminal, we recommend that you let the Python extension change "terminal.integrated.inheritEnv" to false in your user settings.

# Package Installation

In [None]:
# uncomment to install packages for first time
'''
pip install -r requirements.txt
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# pip install gcld3
#
'''

In [87]:
import num2words

# Imports

In [1]:
import os
import os.path
import numpy as np
import string
import re
import csv
import codecs
import sys
import io

In [2]:
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

# Language Detection

In [3]:
from langdetect import detect

## Detecting language

In [11]:
def Detect_Lnaguage_Semantic(line):
    """Return the language code that ``detect`` reports for ``line``.

    Detection is best-effort: if ``detect`` raises for any reason the
    function returns ``np.nan`` instead of propagating the error.
    """
    try:
        return detect(line)
    except Exception:
        # Catch ordinary errors only; a bare ``except`` would also swallow
        # KeyboardInterrupt / SystemExit and make long runs impossible to stop.
        return np.nan

## Cleaning Language (semantic)

In [12]:
def Clean_Language_Sem_Singleline(line, language='en'):
    """Keep only the items of ``line`` whose detected language equals ``language``."""
    kept = []
    for token in line:
        if Detect_Lnaguage_Semantic(token) == language:
            kept.append(token)
    return kept

def Clean_Language_Sem_Multiline(lines_list, language='en'):
    """Apply Clean_Language_Sem_Singleline to every entry of ``lines_list``."""
    cleaned = []
    for entry in lines_list:
        cleaned.append(Clean_Language_Sem_Singleline(entry, language=language))
    return cleaned

## Cleaning Language (syntactic)

In [13]:
def clean_non_english(word):
    """Return ``word`` with every character of code point >= 128 removed."""
    ascii_chars = filter(lambda ch: ord(ch) < 128, word)
    return "".join(ascii_chars)

def Clean_Language_Syn_Singleline(line, language='en'):
    """Strip non-ASCII characters from every item of ``line``.

    ``language`` is accepted for signature symmetry but is not used.
    """
    return list(map(clean_non_english, line))

def Clean_Language_Syn_Multiline(lines_list, language='en'):
    """Apply Clean_Language_Syn_Singleline to every entry of ``lines_list``."""
    cleaned = []
    for entry in lines_list:
        cleaned.append(Clean_Language_Syn_Singleline(entry, language=language))
    return cleaned


## exp

In [124]:
#print(clean_non_english('Hello world'))

Hello world


# Removing handle mention

In [4]:
# Matches "@" followed by one or more characters drawn from the bracketed class.
USER_REGEX_PATTERN = r"(@[((A-Za-z0-9).((?<=^)|(?<=)).(?=$)\w]+)"

def Remove_Mention_Singleline(word, replacer=" user"):
    """Replace every USER_REGEX_PATTERN match in ``word`` with ``replacer``."""
    mention_re = re.compile(USER_REGEX_PATTERN)
    return mention_re.sub(replacer, word)

def Remove_Mention_Multiline(lines_list, replacer = " user"):
    """Apply Remove_Mention_Singleline to every string in ``lines_list``."""
    replaced = []
    for text in lines_list:
        replaced.append(Remove_Mention_Singleline(text, replacer=replacer))
    return replaced

# Accented Char Removal

In [5]:
import unicodedata
import unidecode

In [6]:
def accent_rm_data_Singleline(line):
    """Strip accents via NFD decomposition, dropping anything that is not ASCII."""
    decomposed = unicodedata.normalize('NFD', line)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def accent_rm_Singleline(line):
    """Return the ``unidecode`` transliteration of ``line``."""
    transliterated = unidecode.unidecode(line)
    return transliterated

def accent_rm_Multiline(lines_list, remover=1):
    """Remove accents from every string in ``lines_list``.

    ``remover == 1`` selects accent_rm_data_Singleline; any other value
    selects accent_rm_Singleline.
    """
    strip_accents = accent_rm_data_Singleline if remover == 1 else accent_rm_Singleline
    return [strip_accents(text) for text in lines_list]

# Tokenization

In [7]:
from nltk.tokenize import * #TweetTokenizer

In [8]:
# Module-level tweet tokenizer; Twik_Tweet_Tokenizer() replaces it with a reconfigured instance.
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=False, strip_handles=False)

def Twik_Tweet_Tokenizer(pre_case=True, redu_len=False, strip_handle=True):
    """Rebind the module-level ``tokenizer`` to a TweetTokenizer built with the given options."""
    global tokenizer
    options = dict(
        preserve_case=pre_case,
        reduce_len=redu_len,
        strip_handles=strip_handle,
    )
    tokenizer = TweetTokenizer(**options)
    
# Integer codes accepted by the ``Tokenizer`` argument of Tokenize_Singleline.
Tokenizer_list={
    'TweetTokenizer':1,
    'WordTokenizer':2
}

In [9]:
# gets a line as a input and returns a list

def Tokenize_Singleline(line, Tokenizer=None):
    """Tokenize one string and return the list of tokens.

    ``Tokenizer`` is compared against the codes in ``Tokenizer_list``:
    the 'TweetTokenizer' code uses the module-level ``tokenizer``, the
    'WordTokenizer' code uses ``word_tokenize``, and anything else falls
    back to ``casual_tokenize``.
    """
    if Tokenizer == Tokenizer_list['TweetTokenizer']:
        return tokenizer.tokenize(line)
    if Tokenizer == Tokenizer_list['WordTokenizer']:
        return word_tokenize(line)
    return casual_tokenize(line)

def Tokenize_Multiline(lines_list, Tokenizer=None):
    """Tokenize every string in ``lines_list`` with Tokenize_Singleline."""
    tokenized = []
    for text in lines_list:
        tokenized.append(Tokenize_Singleline(text, Tokenizer))
    return tokenized
    

## exp

In [77]:
#print(Tokenize_Singleline("@TargetZonePT :pouting_face: no he bloody isn't I was upstairs getting changed !",1))

['@TargetZonePT', ':pouting_face:', 'no', 'he', 'bloody', "isn't", 'I', 'was', 'upstairs', 'getting', 'changed', '!']


# HashTag Segmentation

In [10]:
from ekphrasis.classes.segmenter import Segmenter

In [11]:
# Word segmenter used by Hashtag_Segmentation_Single, built from the "twitter" corpus
# (the recorded output shows it reading that corpus' 1-gram and 2-gram data).
segmenter = Segmenter(corpus = "twitter")
# Alternative corpus, left disabled:
#segmenter = Segmenter(corpus = "english")

Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [12]:
def Hashtag_Segmentation_Single(line, replacer =''):
    """Segment hashtag tokens into words.

    Tokens starting with '#' are replaced by ``replacer`` followed by the
    segmenter's output for the text after the '#'; other tokens pass through.
    """
    return [
        replacer + segmenter.segment(token[1:]) if token.startswith('#') else token
        for token in line
    ]

##

def Hashtag_Segmentation_Multi(lines_list, replacer=''):
    """Apply Hashtag_Segmentation_Single to every token list in ``lines_list``.

    ``replacer`` is forwarded to the single-line function. It defaults to the
    same value as there, so existing one-argument calls behave as before.
    """
    return [Hashtag_Segmentation_Single(line, replacer=replacer) for line in lines_list]


# Lexical Normalization

In [13]:
def Import_Lexical_Dict(lex_file_dir = 'LexicalNormalizationData.txt'):
    """Load a lexical-normalization mapping from a whitespace-separated file.

    Each usable line contributes ``first_token -> second_token``; any further
    tokens on the line are ignored. Lines with fewer than two tokens (blank
    lines, stray words) are skipped instead of raising IndexError.

    :param lex_file_dir: path of the mapping file
    :return: dict mapping the raw form to its normalized form
    """
    lexical_dict = {}
    with open(lex_file_dir, 'r') as f:
        for line in f:
            data = line.split()
            if len(data) < 2:
                continue
            lexical_dict[data[0]] = data[1]
    return lexical_dict

In [14]:
# Loaded once from the default file; looked up by Lexical_Normilization_Singleline.
lexical_dictionary = Import_Lexical_Dict()

In [15]:
def  Lexical_Normilization_Singleline(line):
    """Replace each token found in ``lexical_dictionary`` by its mapped value."""
    return [lexical_dictionary.get(token, token) for token in line]

def Lexical_Normilization_Multiline(lines_list):
    """Apply Lexical_Normilization_Singleline to every token list in ``lines_list``."""
    return list(map(Lexical_Normilization_Singleline, lines_list))


# Special Char Removal

In [16]:
# Character inventories written as space-separated raw strings. Because they are raw
# strings, each backslash is itself a literal character of the string.
_STRING_PUNCT = r"! \" # $ % & ' \( \) \* \+ , - . / : ; < = > \? @ \[ \\ \] ^ _ ` \{ | \} ~"
_CURRENCY = r"\$ £ € ¥ ฿ ₽ ﷼ ₴"
_PUNCTUATION = r". … , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ？ ！ ， 、 ； ： ～ · । ، ۔ ؛ ٪ % + - / = @ ^ | ~ "
_QUOTES = r'\' " " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 （ ） 〔 〕 【 】 《 》 〈 〉'
_NUMBER=r"0 1 2 3 4 5 6 7 8 9"
_SPECIAL_UNICODE = r"・ ・ ・ ⁰"

# Default selection: currency + punctuation + quotes, still a plain string at this point.
# _STRING_PUNCT, _NUMBER and _SPECIAL_UNICODE are not part of the default.
SP_CHAR_DICTIONARY = _CURRENCY + _PUNCTUATION + _QUOTES

In [17]:

def MK_SP_Char_dictionary(DataString):
    """Map every character of ``DataString`` to the index of its last occurrence."""
    return {character: position for position, character in enumerate(DataString)}


In [18]:
# Rebinds SP_CHAR_DICTIONARY to the value returned by MK_SP_Char_dictionary for its current value.
SP_CHAR_DICTIONARY = MK_SP_Char_dictionary(SP_CHAR_DICTIONARY)

In [19]:
def Twik_SP_CHAR_STRING(string_punct = False,currency = False,punctuation = False,quotes = False,number = False):
    """Rebuild the global SP_CHAR_DICTIONARY from the selected character groups.

    Each truthy flag appends the matching constant (in the order string
    punctuation, currency, punctuation, quotes, number); the concatenation
    is then converted with MK_SP_Char_dictionary.
    """
    global SP_CHAR_DICTIONARY
    groups = (
        (string_punct, _STRING_PUNCT),
        (currency, _CURRENCY),
        (punctuation, _PUNCTUATION),
        (quotes, _QUOTES),
        (number, _NUMBER),
    )
    combined = r''
    for enabled, characters in groups:
        if enabled:
            combined += characters
    SP_CHAR_DICTIONARY = MK_SP_Char_dictionary(combined)

In [20]:
# input type : list

def Special_Char_Remove_Singleline(line, ignor_date_time = True, SP_IGNOR = ['']):
    """Drop tokens that are, or start with, a character in SP_CHAR_DICTIONARY.

    :param line: list of string tokens
    :param ignor_date_time: when True, tokens accepted by Check_Date_Time are
        kept regardless of their characters
    :param SP_IGNOR: tokens (or leading characters) that are always kept;
        it is only read, never mutated
    :return: new list with the surviving tokens in their original order

    Empty tokens are discarded up front. Previously ``word[0]`` was evaluated
    on them and raised IndexError before any guard could run.
    """
    return_list = []
    for word in line:
        if not word:
            continue

        if word[0] in SP_IGNOR or word in SP_IGNOR:
            return_list.append(word)
            continue

        if ignor_date_time:
            if Check_Date_Time(word):
                return_list.append(word)
            elif word not in SP_CHAR_DICTIONARY and word[0] not in SP_CHAR_DICTIONARY:
                return_list.append(word)
        elif word[0] not in SP_CHAR_DICTIONARY:
            return_list.append(word)
    return return_list

# input type : list of list

def Special_Char_Remove_Multiline(lines_list, ignor_date_time = True):
    """Apply Special_Char_Remove_Singleline to every token list in ``lines_list``."""
    filtered = []
    for tokens in lines_list:
        filtered.append(Special_Char_Remove_Singleline(tokens, ignor_date_time = ignor_date_time))
    return filtered


In [21]:
def Special_Char_Remove_From_String_Singleline(line, SP_IGNOR = []):
    """Replace special characters in ``line`` with a single space each.

    A character is kept when it is absent from SP_CHAR_DICTIONARY or listed
    in ``SP_IGNOR``; every other character becomes ' '.
    """
    return "".join(
        character if (character not in SP_CHAR_DICTIONARY or character in SP_IGNOR) else ' '
        for character in line
    )

def Special_Char_Remove_From_String_Multiline(lines_list,SP_IGNOR = [], ignor_date_time = False):
    """Clean every string in ``lines_list`` with Special_Char_Remove_From_String_Singleline.

    When ``ignor_date_time`` is true, strings accepted by Check_Date_Time are
    passed through unchanged.
    """
    cleaned = []
    for text in lines_list:
        if ignor_date_time and Check_Date_Time(text):
            cleaned.append(text)
        else:
            cleaned.append(Special_Char_Remove_From_String_Singleline(text, SP_IGNOR = SP_IGNOR))
    return cleaned

# Spell Correction

In [22]:
#from ekphrasis.classes.spellcorrect import SpellCorrector
#from spellchecker import SpellChecker
from textblob import Word
from textblob import TextBlob

In [23]:
#spellcorrector = SpellCorrector(corpus = "english") # name --> ekpspc
#spellcorrector = SpellChecker() # name --> 'pys'
# No corrector object is needed for the 'txb' option, so this stays None.
# NOTE(review): Spell_Correction_Singleline calls methods on this object for any other
# ``corrector`` value, so one of the commented-out correctors must be enabled first.
spellcorrector = None # name --> 'txb' 

In [45]:
# takes a list of words as input
def Spell_Correction_Singleline(line, corrector='txb',txb_word=True):
    """Spell-correct every token in ``line`` and return the corrected list.

    ``corrector`` selects the backend: 'pys' calls ``spellcorrector.correction``,
    'txb' uses TextBlob (``Word`` when ``txb_word`` is true, otherwise
    ``TextBlob`` converted back to ``str``), and any other value calls
    ``spellcorrector.correct``.
    """
    if corrector == 'pys':                                          #pyspellchecker
        return [spellcorrector.correction(token) for token in line]
    if corrector != 'txb':                                          #ekphrasis
        return [spellcorrector.correct(token) for token in line]

    def _textblob_fix(token):                                       #TextBlob
        if txb_word:
            return Word(token).correct()
        return str(TextBlob(token).correct())

    return [_textblob_fix(token) for token in line]

# takes a list of lines as input
def Spell_Correction_Multiline(lines_list, corrector = 'txb'):
    """Apply Spell_Correction_Singleline to every token list in ``lines_list``."""
    corrected = []
    for tokens in lines_list:
        corrected.append(Spell_Correction_Singleline(tokens, corrector = corrector))
    return corrected


# Stemming

In [24]:
import krovetzstemmer
from nltk.stem.snowball import SnowballStemmer

In [25]:
# Stemmer instance used by Stem_SingleToken_list.
stemmer = krovetzstemmer.Stemmer()
# Disabled alternative; ref_file_name / ignor_stopwords are not defined in the visible cells.
#stemmer = SnowballStemmer(language = ref_file_name ,ignore_stopwords= ignor_stopwords)

In [26]:
def Stem_SingleToken_list(line):
    """Return the stem of every token in ``line`` using the module-level ``stemmer``."""
    stems = []
    for token in line:
        stems.append(stemmer.stem(token))
    return stems

def Stem_MultiToken_list(lines_list):
    """Apply Stem_SingleToken_list to every token list in ``lines_list``."""
    return list(map(Stem_SingleToken_list, lines_list))

In [24]:
def printSpace(maxvalue, words):
    """Print ``maxvalue - len(words)`` spaces with no trailing newline."""
    padding_width = maxvalue - len(words)
    print(" " * padding_width, end="")

# Lemmatization

In [27]:
import nltk
#nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
from nltk import pos_tag

In [28]:
# init lemmatizer
# Shared lemmatizer instance.
lemmatizer = WordNetLemmatizer()
# Lookup from a one-letter key to the POS code handed to the lemmatizer.
# NOTE(review): 'r' and 'j' both map to 'v' - confirm this is intentional.
POS_VALUES = {'r':'v', 'j': 'v', 'n':'n', 'v':'v'}

In [29]:
def Lemmatize_SingleToken_list(line):
    """Lemmatize every token in ``line`` using its POS tag.

    The lower-cased first letter of each tag from ``pos_tag`` is translated
    through POS_VALUES; letters missing from that table fall back to 'n'.
    """
    lemmas = []
    for token, tag in pos_tag(line):
        wordnet_pos = POS_VALUES.get(tag[0].lower(), 'n')
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

def Lemmatize_MultiToken_list(lines_list):
    """Apply Lemmatize_SingleToken_list to every token list in ``lines_list``.

    The previous body called Stem_SingleToken_list, so this function stemmed
    instead of lemmatizing.
    """
    return [Lemmatize_SingleToken_list(line) for line in lines_list]

# Stop-Word Removal

In [30]:
from nltk.corpus import stopwords

In [31]:
# from file
def Import_Stop_Word(St_wrd_file_dir = 'indris', from_file=False):
    """Return a set of stop words.

    With ``from_file`` true, ``St_wrd_file_dir`` is a path whose lines (newline
    characters removed) become the set. Otherwise it is passed to
    ``stopwords.words`` as the word-list name, e.g. 'english'.
    """
    if not from_file:
        return set(stopwords.words(St_wrd_file_dir)) # 'english' , 'indris'
    with open(St_wrd_file_dir,'r') as f:
        return {raw_line.replace('\n','') for raw_line in f.readlines()}

In [32]:
# Active stop-word set. To load from a file instead, pass the file path and from_file=True.
Stop_Words = Import_Stop_Word(St_wrd_file_dir = 'english',from_file= False) # while importing fromfile use {"filename.txt" and from_file=True}

In [33]:
def rm_Stopword_Singleline(Line_Token):
    """Split every item of ``Line_Token`` on whitespace and drop words found in Stop_Words."""
    return [
        word
        for item in Line_Token
        for word in item.split()
        if word not in Stop_Words
    ]

def rm_Stopword_Multiline(lines_list):
    """Apply rm_Stopword_Singleline to every entry of ``lines_list``."""
    return list(map(rm_Stopword_Singleline, lines_list))

# URL And Email Identification, Expansion/Normalization

In [34]:
# Matches "http://", "https://" or "www." followed by a run of non-whitespace characters.
URL_PATTERN = r'https?://\S+|www\.\S+'
# Matches text starting with "__CONNECTION" or "__CLIENT" plus following non-whitespace;
# Expand_urls_Singleline strips these from expanded URLs.
EXPANSION_ERROR = r'__CONNECTION\S+|__CLIENT\S+'

## URL

### Normalization

In [35]:
def Normalize_urls_Singleline(line, replace_pattern = "urladd"):
    """Replace every URL_PATTERN match inside each token of ``line`` with ``replace_pattern``."""
    normalized = []
    for token in line:
        normalized.append(re.sub(pattern= URL_PATTERN, repl = replace_pattern, string = token))
    return normalized

def Normalize_urls_Multiline(lines_list,replace_pattern = "urladd"):
    """Apply Normalize_urls_Singleline to every token list in ``lines_list``."""
    normalized = []
    for tokens in lines_list:
        normalized.append(Normalize_urls_Singleline(tokens, replace_pattern = replace_pattern))
    return normalized

### Expansion

In [36]:
import urlexpander

In [37]:
def Expand_urls_Singleline(line):
    """Expand shortened URLs among the tokens of ``line``.

    A token is expanded only when it matches URL_PATTERN and
    ``urlexpander.is_short`` accepts it; EXPANSION_ERROR matches are then
    removed from the expanded text. All other tokens pass through unchanged.
    """
    expanded = []
    for token in line:
        is_short_url = re.search(URL_PATTERN, token) and urlexpander.is_short(token)
        if not is_short_url:
            expanded.append(token)
            continue
        full_url = urlexpander.expand(token)
        expanded.append(re.sub(pattern= EXPANSION_ERROR, repl="", string = full_url))
    return expanded

def Expand_urls_Multiline(lines_list):
    """Apply Expand_urls_Singleline to every token list in ``lines_list``."""
    return list(map(Expand_urls_Singleline, lines_list))


## Email

In [36]:
# Whole-token e-mail matcher (anchored with ^ and $).
# The separator before the top-level domain is an escaped literal dot; the previous
# unescaped "." matched any character, so "john@examplecom" counted as an address.
EMAIL_PATTERN = r'^(?:(?!.*?[.]{2})[a-zA-Z0-9](?:[a-zA-Z0-9.+!%-]{1,64}|)|\"[a-zA-Z0-9.+!% -]{1,64}\")@[a-zA-Z0-9][a-zA-Z0-9.-]+(\.[a-z]{2,}|\.[0-9]{1,})$'

In [37]:
def Normalize_emails_Singleline(line, replace_pattern = "urladd"):
    """Replace every EMAIL_PATTERN match inside each token of ``line`` with ``replace_pattern``."""
    normalized = []
    for token in line:
        normalized.append(re.sub(pattern= EMAIL_PATTERN, repl = replace_pattern, string = token))
    return normalized

def Normalize_emails_Multiline(lines_list,replace_pattern = "urladd"):
    """Apply Normalize_emails_Singleline to every token list in ``lines_list``.

    The previous body delegated to Normalize_urls_Singleline, so e-mail
    addresses were never normalized by this function.
    """
    return [Normalize_emails_Singleline(line, replace_pattern = replace_pattern) for line in lines_list]

# Expanding Contractions

In [38]:
import contractions

In [39]:
def Expand_Contraction_Singleline(line):
    """Run ``contractions.fix`` on every item of ``line`` and return the results as a list."""
    return list(map(contractions.fix, line))

def Expand_Contraction_Miltiline(lines_list):
    """Apply Expand_Contraction_Singleline to every entry of ``lines_list``."""
    expanded = []
    for entry in lines_list:
        expanded.append(Expand_Contraction_Singleline(entry))
    return expanded

# Chat Word (Short & Slang Words) Conversion

In [40]:
def Import_Chat_Short_and_Slang_Word(file_dir, split_term = ' '):
    """Load a chat/slang abbreviation table from a text file.

    Every line is split on ``split_term``. The first field, lower-cased, is
    the key; the second field, with newlines removed and lower-cased, is the
    value. Lines that do not contain the separator (blank or malformed) are
    skipped instead of raising IndexError.

    :param file_dir: path of the table file
    :param split_term: separator between abbreviation and expansion
    :return: dict mapping abbreviation to expansion
    """
    return_dict = {}
    with open(file_dir,'r') as f:
        for line in f:
            data = line.split(split_term)
            if len(data) < 2:
                continue
            return_dict[data[0].lower()] = (data[1].replace('\n','')).lower()
    return return_dict

In [41]:
# Abbreviation table loaded from 'Chat_Word.txt', which uses '=' as the separator.
# Sources noted by the author:
Chat_Word_Dictionary = Import_Chat_Short_and_Slang_Word('Chat_Word.txt', split_term = '=') #https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt
                                                                                           # https://www.webopedia.com/reference/text-abbreviations/

In [61]:
def  Chat_Word_Conversion_Singleline(line):
    """Replace each token found in ``Chat_Word_Dictionary`` by its mapped value."""
    return [Chat_Word_Dictionary.get(token, token) for token in line]

def Chat_Word_Conversion_Multiline(lines_list):
    """Apply Chat_Word_Conversion_Singleline to every token list in ``lines_list``.

    The previous body delegated to Lexical_Normilization_Singleline, so the
    chat-word dictionary was never consulted by this function.
    """
    return [Chat_Word_Conversion_Singleline(line) for line in lines_list]

# Date Time Checking and Customization

## Checking

In [42]:
from dateutil.parser import parse

In [43]:
def Check_Date_Time(datastring, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param datastring: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: False for strings made only of digits or strings the parser
             rejects, True otherwise
    """
    # Digit-only strings are treated as plain numbers, not dates.
    if datastring.isdigit():
        return False
    try:
        parse(datastring, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # dateutil documents OverflowError for values too large to represent,
        # in addition to ValueError for unparseable input.
        return False
    return True


## Deletion

In [44]:
def Remove_Date_Time_Singleline(line, Fuzzy = False):
    """Return the tokens of ``line`` that Check_Date_Time does not accept as dates."""
    kept = []
    for token in line:
        if not Check_Date_Time(token, fuzzy = Fuzzy):
            kept.append(token)
    return kept

def Remove_Date_Time_Multiline(lines_list, Fuzzy = False):
    """Apply Remove_Date_Time_Singleline to every token list in ``lines_list``."""
    filtered = []
    for tokens in lines_list:
        filtered.append(Remove_Date_Time_Singleline(tokens, Fuzzy = Fuzzy))
    return filtered

# Number to Word

In [45]:
from num2words import num2words

In [46]:
# ASCII digits only: unlike str.isdigit(), characters such as superscript digits are rejected.
__NUMBERS_= ['0','1','2','3','4','5','6','7','8','9']

def isdigit_func(word):
    """Return True when ``word`` is a non-empty string of ASCII digits.

    The empty string returns False. It used to return True, which made
    Number_to_Word_Singleline call ``int('')`` and raise ValueError.
    """
    if not word:
        return False
    return all(character in __NUMBERS_ for character in word)

In [47]:
def Number_to_Word_Singleline(line, replacer = ""):
    """Spell out digit-only tokens of ``line`` with ``num2words``.

    Hyphens in the spelled-out form are replaced by ``replacer``; tokens
    rejected by isdigit_func pass through unchanged.
    """
    return [
        num2words(int(token)).replace('-',replacer) if isdigit_func(token) else token
        for token in line
    ]

def Number_to_Word_Multiline(lines_list, replacer = ""):
    """Apply Number_to_Word_Singleline to every token list in ``lines_list``."""
    converted = []
    for tokens in lines_list:
        converted.append(Number_to_Word_Singleline(tokens, replacer=replacer))
    return converted


## exp

In [149]:
#print(num2words(int('96')).replace('-',' '))

# Emojis and Emoticons to Text Conversion and Deletion

In [48]:
import emot

In [49]:
# Shared emot converter; its .emoticons() and .emoji() methods are used by the emoji helpers.
emo_converter = emot.emot()

## Emo -ji/ticons Text Conversion

In [50]:
def Translate_Emoticons(word):
    """Look ``word`` up as an emoticon.

    Returns ``[flag, text]`` where ``flag`` is the converter's 'flag' entry
    and ``text`` is the first 'mean' entry when the flag is truthy, else the
    original ``word``.
    """
    result = emo_converter.emoticons(word)
    found = result['flag']
    translated = result['mean'][0] if found else word
    return [found, translated]
    

In [51]:
# Tokens that Emoji_Emoticon_to_Text_Singleline upper-cases before the emoticon lookup.
EXTRA_EMOTICONS = set([':p'])

In [52]:
def Emoji_Emoticon_to_Text_Singleline(line, check_date_time = False):
    """Translate emoticon and emoji tokens of ``line`` into text.

    Empty tokens are dropped. With ``check_date_time`` true, tokens that start
    with a digit and pass ``Check_Date_Time(..., fuzzy=True)`` are kept as-is.
    Each remaining token is tried as an emoticon first (upper-cased when
    listed in EXTRA_EMOTICONS), then as an emoji, whose meaning has its ':'
    characters removed. Unrecognised tokens pass through unchanged.
    """
    converted = []
    for token in line:
        if not token:
            continue

        looks_like_date = (
            check_date_time
            and str(token[0]).isdigit()
            and Check_Date_Time(token,fuzzy=True)
        )
        if looks_like_date:
            converted.append(token)
            continue

        lookup_form = token.upper() if token in EXTRA_EMOTICONS else token
        is_emoticon, meaning = Translate_Emoticons(lookup_form)
        if is_emoticon:
            converted.append(meaning)
            continue

        emoji_info = emo_converter.emoji(token)
        if emoji_info['flag']:
            converted.append(emoji_info['mean'][0].replace(':',''))
        else:
            converted.append(token)
    return converted

def Emoji_Emoticons_to_Text_Multiline(lines_list):
    """Apply Emoji_Emoticon_to_Text_Singleline (default options) to every token list."""
    return list(map(Emoji_Emoticon_to_Text_Singleline, lines_list))


## Removing Emoji-Emoticons

In [53]:
def Emoji_Emoticon_Remove_Singleline(line, check_date_time = False):
    """Return the tokens of ``line`` that are neither emoticons nor emojis.

    :param line: list of string tokens
    :param check_date_time: when True, tokens that start with a digit and pass
        ``Check_Date_Time(..., fuzzy=True)`` are kept without any lookup
    :return: list of surviving tokens in their original order

    Lookups go through the shared ``emo_converter`` instance, matching
    Emoji_Emoticon_to_Text_Singleline. The previous body called module-level
    ``emot.emoticons`` / ``emot.emoji`` instead.
    NOTE(review): those look like a different emot API than ``emot.emot()`` -
    confirm against the installed emot version.
    Empty tokens are dropped, because ``word[0]`` would raise IndexError on them.
    """
    return_list = []

    for word in line:
        if not word:
            continue

        if check_date_time and str(word[0]).isdigit() and Check_Date_Time(word,fuzzy=True):
            return_list.append(word)
            continue

        data = emo_converter.emoticons(word)
        # Accept both result shapes the original code handled: a list of dicts or a dict.
        if isinstance(data, list):
            is_emoticon = data[0]['flag']
        else:
            is_emoticon = data['flag']
        if is_emoticon:
            continue

        if emo_converter.emoji(word)['flag']:
            continue

        return_list.append(word)
    return return_list

def Emoji_Emoticons_Remove_Multiline(lines_list):
    """Apply Emoji_Emoticon_Remove_Singleline to every token list in ``lines_list``.

    The previous body called Emoji_Emoticon_to_Text_Singleline, which
    translates emojis to text instead of removing them.
    """
    return [Emoji_Emoticon_Remove_Singleline(line) for line in lines_list]


## Text to Emoji/Emoticons conversion

In [1]:
from emot import EMOTICONS_EMO,EMOJI_UNICODE

In [55]:
#only emojis

def Text_to_Emoji_Emoticons_Singleline(line):
    """Replace whitespace-separated words that are EMOJI_UNICODE keys by their mapped value.

    Every word of the result, including the first, is preceded by one space.
    """
    pieces = []
    for token in line.split():
        replacement = EMOJI_UNICODE[token] if token in EMOJI_UNICODE else token
        pieces.append(" " + replacement)
    return "".join(pieces)

## exp

In [None]:
#print(Emoji_Emoticon_to_Text_Singleline([":p"], check_date_time = False))

# Removing Extra Spaces and Single Character

In [56]:
import re
# One or more literal spaces (the commented alternative would match any whitespace).
SPACE_PATTERN = r' +'#r'\s+'
# Any single character that sits between string start or a space and string end or a space.
SINGLE_CHAR_RM = r"((?<=^)|(?<= )).((?=$)|(?= ))"

In [57]:
#input -> String
def Remove_Extra_Spaces_and_char_Singleline(line, RM_SINGLE_CHAR = False):
    """Collapse runs of spaces in ``line`` to one space.

    With ``RM_SINGLE_CHAR`` true, SINGLE_CHAR_RM matches are removed and the
    result is stripped before the spaces are collapsed.
    """
    if not RM_SINGLE_CHAR:
        return re.sub(pattern = SPACE_PATTERN, repl =" ", string = line)
    without_single_chars = re.sub(SINGLE_CHAR_RM, '', line).strip()
    return re.sub(SPACE_PATTERN, " ", without_single_chars)

#input -> list of string
def Remove_Extra_Spaces_and_char_Multiline(lines_list):
    """Apply Remove_Extra_Spaces_and_char_Singleline (default options) to every string."""
    return list(map(Remove_Extra_Spaces_and_char_Singleline, lines_list))

# Removing Repeated Words

In [58]:
def RM_Repeated_Word_Singleline(line):
    """Keep only the first occurrence of each whitespace-separated word, joined by single spaces."""
    seen = set()
    unique_words = []
    for token in line.split():
        if token not in seen:
            seen.add(token)
            unique_words.append(token)
    return ' '.join(unique_words)
    
def RM_Repeated_Word_Multilineline(lines_list):
    """Apply RM_Repeated_Word_Singleline to every string in ``lines_list``."""
    return list(map(RM_Repeated_Word_Singleline, lines_list))

# Preprocessing Module

## Tweaking

In [63]:
#Twik_SP_CHAR_STRING(string_punct = False,currency = True,punctuation = True,quotes = True,number = True)
# Reconfigure the shared tokenizer: preserve case, enable reduce_len, keep @handles.
Twik_Tweet_Tokenizer(pre_case=True, redu_len=True, strip_handle=False)

## processing

In [59]:
# Characters exempt from special-character removal in PreProcessingModule (an apostrophe is the noted candidate).
SP_IGNOR_LIST = [''] #"'"

def PreProcessingModule(line):
    """Run the full preprocessing pipeline on one raw text string.

    Steps, in order: accent removal, mention replacement, text-to-emoji
    substitution, contraction expansion, tweet tokenization, URL
    normalization, hashtag segmentation, emoji/emoticon-to-text conversion,
    special-character token removal, lexical normalization, chat-word
    conversion, per-token special-character replacement, number-to-word
    conversion, lemmatize -> stem -> lemmatize, stop-word removal,
    extra-space and single-character removal, repeated-word removal.

    :param line: str, the raw text
    :return: str, the processed words joined by single spaces
    """

    line = accent_rm_data_Singleline(line) # in/out -> string
    #print("Accent removal: ",line)

    line = Remove_Mention_Singleline(line) # in/out -> string
    #print("Removing Mention: ",line)

    line = Text_to_Emoji_Emoticons_Singleline(line) # in/out -> string
    #print("EMoji Emoticon: ",line)
    
    line = Expand_Contraction_Singleline([line]) #in/out ->list with only one element
    #print("Expand Contraction: ", line)


    line = line[0] # in->list with only one element // out-> string
    #print(line)

    # Tokenizer code 1 selects the module-level TweetTokenizer instance.
    line = Tokenize_Singleline(line,1) #in/out -> list
    #print("Tokenization: ", line)

    line = Normalize_urls_Singleline(line, replace_pattern = 'url')    
    #print("URL normalization: ",line)
        
    # The bare string below is a disabled spell-correction step; it is never executed.
    '''
    Text_blob_Word = False
    if Text_blob_Word:
        line = Spell_Correction_Singleline(line)
    else:
        line = [" ".join(line)]
        line = Spell_Correction_Singleline(line,txb_word=False)
        #print(line)
        line = Tokenize_Singleline(line[0],1)
    '''
    #print(line)

    line = Hashtag_Segmentation_Single(line, replacer ='')
    #print("Hashtag Segmentation: ",line)

    line = Emoji_Emoticon_to_Text_Singleline(line,check_date_time=True)
    #print("Emoji Emoticon to Text: ",line)

    line = Special_Char_Remove_Singleline(line, SP_IGNOR = [''])
    #print("Special char Removal: ",line)

    line = Lexical_Normilization_Singleline(line)
    #print("Lexical Normalization: ", line)

    line = Chat_Word_Conversion_Singleline(line)
    #print(line)

    # The *_Multiline variant is applied to the token list, so each token is cleaned as its own string.
    line = Special_Char_Remove_From_String_Multiline(line, SP_IGNOR = SP_IGNOR_LIST, ignor_date_time = False)
    #print(line)

    #line = Remove_Extra_Spaces_Multiline(line)
    #print(line)

    # Re-split: tokens may now contain spaces introduced by the replacement steps above.
    line = " ".join(line).split()


    line = Number_to_Word_Singleline(line, replacer =" ")
    #print("Number to word: ",line)

    # NOTE(review): lemmatization runs both before and after stemming - confirm this is intended.
    line = Lemmatize_SingleToken_list(line)

    line = Stem_SingleToken_list(line)
    #print("Stemming: ",line)

    line = Lemmatize_SingleToken_list(line)
    #print(line)
    
    line = rm_Stopword_Singleline(line)
    #print(line)
    
    #line = Remove_Date_Time_Singleline(line)
    #print(line)

    # From here on ``line`` is a single string again.
    line = Remove_Extra_Spaces_and_char_Singleline(" ".join(line), RM_SINGLE_CHAR = True)#.split()

    line = RM_Repeated_Word_Singleline(line)
    
    return line

# Load Data

In [67]:
from tqdm import tqdm

In [68]:
# Relative path of the tab-separated dataset read by the loading cell.
DATA_DIRECTORY = '''../../DataSets/SemEval2018-IronyDetection/SemEval2018-IronyDetection.txt'''  #SmallVersion

In [70]:
# Load the dataset and preprocess the third tab-separated column of every row.
# Rawline[i] holds the original text and TweetsTextData[i] the processed text.
# The former ``global TweetsTextData, Rawline`` statement was removed: at module
# level it is redundant, and placed after the assignments it is a SyntaxError on
# Python 3.6+ ("name is assigned to before global declaration").
TweetsTextData = []
Rawline = []
PreProcessedData = []
with open(DATA_DIRECTORY, 'r') as f:
    next(f)  # skip the first line of the file
    lines = [line.split('\t') for line in f]

for line in tqdm(lines, total=len(lines)):
    PreProcessedData = PreProcessingModule(line[2])
    Rawline.append(line[2])
    TweetsTextData.append(PreProcessedData)

100%|██████████| 3834/3834 [00:07<00:00, 483.75it/s]


In [None]:
# Drop the leftover file-handle name from the loading cell.
del f

# test

In [None]:
# Smoke test: tokenize a sample tweet with tokenizer code 1.
print(Tokenize_Singleline('"@myrcurial: @amz__123 Aaaaaamd what time is your bedtime? >better  tweet Dad at night from my house. I don\'t want to get in trouble!',1))

In [None]:
# Compare one raw row with its preprocessed counterpart.
print(Rawline[3027])
print(TweetsTextData[3027])
#print(len(TweetsTextData))

In [None]:
# Smoke test: run special-character removal on a one-token list.
print(Special_Char_Remove_Singleline(['Aaaamd'], SP_IGNOR = ['']))

In [None]:
# End-to-end smoke test: print a sample tweet and its fully preprocessed form.
testline = '''"@myrcurial: @amz__123 Aaaaaamd what time is your bedtime? >better  tweet Dad at night from my house. I don\'t want to get in trouble!'''
print(testline)
#print(Tokenize_Singleline(testline,1))
print(PreProcessingModule(testline))

# Main Module