# Loading libraries

In [1]:
import numpy as np
import pandas as pd
import os
from os import path as osp
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 


from nltk.tokenize import word_tokenize

In [2]:
## Main text file was seperated into three files based on the content

In [3]:
file_name = '../data/assignment_data.txt'

assignment_file = "../data/student_corse_feedback.txt"
twitter_data = "../data/twitter_data.txt"
research_file = "../data/research_data.txt"

preview_length = 100

output_location = '../output'

class TEXT:
    rsh = 'research'
    twt = 'twitter'
    asn = 'assignment'

# Read data 

In [4]:
data = {}
data['research'] = open(research_file, mode = 'r', encoding='utf-8').read()
data['twitter'] = open(twitter_data, mode = 'r', encoding='utf-8').read()
data['assignment'] = open(assignment_file, mode = 'r', encoding='utf-8').read()

In [5]:
data['research'][:preview_length]

'Neural network models have shown their promising opportunities for multi-task\nlearning, which focus '

In [6]:
data['twitter'][:preview_length]

'Reminds me of Liberal Immigration Fraudster Monsef avoiding deportation from Canada. #cdnpoli #LPC #'

In [7]:
data['assignment'][:preview_length]

'Honestly last seven lectures are good. Lectures are understandable. Lecture slides are very useful t'

In [8]:
def write_to_file(text_lines, file_name):
    full_path = osp.join(output_location, file_name)
    dir_path = osp.dirname(full_path)
    if not osp.exists(dir_path):
        os.makedirs(dir_path)
        
    with open(full_path, 'w') as fp:
#         print(text_lines)
        for line in text_lines:
#             print(line)
            fp.write(line + ' ')
    print("Written to {}".format(full_path))

# 1. Tockenize

In [9]:
def tockenize_text(words):
    return word_tokenize(words)

## 1.1 Student Course Feedback

In [10]:
student_feedback_tockenize = tockenize_text(data[TEXT.asn])
write_to_file(student_feedback_tockenize, 'tockenize/{}_output.txt'.format(TEXT.asn))

Written to ../output/tockenize/assignment_output.txt


## 1.2 Twitter Feed

In [11]:
twitter_data_tockenize = tockenize_text(data[TEXT.twt])
write_to_file(twitter_data_tockenize, 'tockenize/{}_output.txt'.format(TEXT.twt))

Written to ../output/tockenize/twitter_output.txt


## 1.3 Research Paper

In [12]:
research_data_tockenize = tockenize_text(data[TEXT.rsh])
write_to_file(research_data_tockenize, 'tockenize/{}_output.txt'.format(TEXT.rsh))

Written to ../output/tockenize/research_output.txt


# 2. Isolated word correction

In [13]:
def spell_ckecking(tockens):
    spell = SpellChecker(distance=2)
    mispelled = spell.unknown(tockens)
    pairs = []
    print("Mispelled count: {}".format(len(mispelled)))
    for i, word in enumerate(mispelled):
        correction = spell.correction(word)
        print('{:2} - "{}" is corrected as "{}"'.format(i, word, correction))
        pairs.append((word, correction))
        if i == 5:
            break
    return pairs

## 2.1 Student Course Feedback

In [14]:
word_pairs = spell_ckecking(student_feedback_tockenize)

Mispelled count: 14
 0 - "''" is corrected as "d'"
 1 - "speed.a" is corrected as "speed"
 2 - "examples.lectures" is corrected as "examples.lectures"
 3 - "undersatand" is corrected as "understand"
 4 - ".." is corrected as "p."
 5 - "class.it" is corrected as "classic"


## 2.2 Twitter Feed

In [15]:
word_pairs = spell_ckecking(twitter_data_tockenize)

Mispelled count: 87
 0 - "//t.co/uibsezoqas" is corrected as "//t.co/uibsezoqas"
 1 - "c��_" is corrected as "c��_"
 2 - "https" is corrected as "steps"
 3 - "//t.co/becgusy2i6" is corrected as "//t.co/becgusy2i6"
 4 - "//t.co/cneywn40x3" is corrected as "//t.co/cneywn40x3"
 5 - "fasttraffic" is corrected as "fasttraffic"


## 2.3 Research Paper

In [16]:
word_pairs = spell_ckecking(research_data_tockenize)

Mispelled count: 16
 0 - "�infantile�" is corrected as "infantile"
 1 - "task-specific" is corrected as "task-specific"
 2 - "luong" is corrected as "long"
 3 - "multi-task" is corrected as "multi-track"
 4 - "shared-private" is corrected as "shared-private"
 5 - "task-dependent" is corrected as "task-dependent"


# 3. Context Sensitive word correction

In [40]:
from symspellpy.symspellpy import SymSpell, Verbosity
word_length = 2
prefix_length = 7
sym_spell = SymSpell(word_length, prefix_length)
print("Corpus file not found") if not sym_spell.create_dictionary("../data/big.txt") else print("Success!")

Success!


In [68]:
preview = 10
def correct_tocknized_text(words):
    corr_count = 0
    corrected_words = []
    for i, word in enumerate(words[:-word_length+1]):
        word_set = [words[i+j] for j in range(word_length)]
        _input = ' '.join(word_set)
        result = sym_spell.word_segmentation(_input)
        correction = result.corrected_string
        if correction.lower() != _input.lower() and preview < corr_count:
            corr_count += 1
            print('"{}" is corrected as "{}"'.format(_input, correction))
        corrected_words.append(correction.split(' ')[0])
    corrected_words.append(correction.split(' ')[1])
    return corrected_words

## 3.1 Student Course Feedback

In [69]:
print(correct_tocknized_text(student_feedback_tockenize))

['Honestly', 'last', 'seven', 'lectures', 'are', 'good', 'a', 'Lectures', 'are', 'understandable', 'a', 'Lecture', 'sides', 'are', 'very', 'useful', 'to', 'self', 'also', 'a', 'The', 'given', 'opportunity', 'to', 'ask', 'questions', 'from', 'the', 'lecturer', 'is', 'appreciative', 'a', 'of', 'Good', 'a', 'a', 'a', 'br', 'a', 'a', 'please', 'do', 'reap', 'at', 'class', 'starting', 'it', 'a', 'a', '39', 'a', 's', 'better', 'for', 'us', 'a', 'a', 'br', 'a', 'a', 'sometimes', 'teaching', 'speed', 'is', 'very', 'high', 'a', 'a', 'br', 'a', 'a', 'a', 'br', 'a', 'a', 'Thanks', 'a', 'a', 'a', 'a', 'br', 'a', 'a', 'of', 'The', 'lectures', 'are', 'good', 'of', 'but', 'a', 'bit', 'speed', 'in', 'class', 'working', 'activity', 'is', 'a', 'must', 'ones', 'please', 'take', 'another', 'hour', 'in', 'thursdays', "madame'", "'", 'a', 'br', 'a', 'a', 'We', 'can', 'hear', 'your', 'voice', 'clearly', 'and', 'can', 'understand', 'the', 'things', 'you', 'teach', 'a', 'Presentation', 'sides', 'also', 'good',

## 3.2 Twitter Feed

In [66]:
print(correct_tocknized_text(twitter_data_tockenize))

"Immigration Fraudster" is corrected as "Immigration Frauds her"
"Fraudster Monsef" is corrected as "Frauds her Money"
"Monsef avoiding" is corrected as "Money avoiding"
"avoiding deportation" is corrected as "avoiding importation"
"deportation from" is corrected as "importation from"
"Canada ." is corrected as "Canada a"
". #" is corrected as "a a"
"# cdnpoli" is corrected as "a campo li"
"cdnpoli #" is corrected as "campo li a"
"# LPC" is corrected as "alps"
"LPC #" is corrected as "Lock"
"# CPCLDR��_" is corrected as "a Could R��_"
"CPCLDR��_ https" is corrected as "Could R��_ http"
"https :" is corrected as "http a"
": //t.co/ZOZOSe1CqQ" is corrected as "a it icon Loose 1 Can"
"//t.co/ZOZOSe1CqQ #" is corrected as "it icon Loose 1 Can a"
"# immigration" is corrected as "a immigration"
"immigration #" is corrected as "immigration a"
"# integration" is corrected as "a integration"
"integration #" is corrected as "integration a"
"# canada" is corrected as "a canada"
"canada https" is 

"And Crashes" is corrected as "And Clashes"
"Crashes In" is corrected as "Clashes In"
"Of Trump" is corrected as "Of Tramp"
"Trump #" is corrected as "Tramp a"
"# fasttraffic" is corrected as "a fast traffic"
"fasttraffic ," is corrected as "fast traffic a"
", #" is corrected as "a a"
"# sitetraffic" is corrected as "a site traffic"
"sitetraffic ," is corrected as "site traffic a"
", #" is corrected as "a a"
"# website" is corrected as "a web site"
"website ," is corrected as "web site a"
", #" is corrected as "a a"
"# traffic" is corrected as "a traffic"
"traffic https" is corrected as "traffic http"
"https :" is corrected as "http a"
": //t.co/zRlJ26jnkC" is corrected as "a it icon or 269 not"
"//t.co/zRlJ26jnkC Mr" is corrected as "it icon or 269 not Mr"
"Mr Know-all" is corrected as "Mr Know all"
"Know-all of" is corrected as "Know all of"
"Immigration https" is corrected as "Immigration http"
"https :" is corrected as "http a"
": //t.co/wTQK4QDiKI" is corrected as "a it cow Take W

## 3.3 Research Paper

In [67]:
print(correct_tocknized_text(research_data_tockenize))

"Neural network" is corrected as "Neutral network"
"for multi-task" is corrected as "for multi ask"
"multi-task learning" is corrected as "multi ask learning"
"learning ," is corrected as "learning a"
", which" is corrected as "a which"
"and task-invariant" is corrected as "and task in variant"
"task-invariant features" is corrected as "task in variant features"
"features ." is corrected as "features a"
". However" is corrected as "a However"
"However ," is corrected as "However a"
", in" is corrected as "a in"
"approaches ," is corrected as "approaches a"
", the" is corrected as "a the"
"by task-specific" is corrected as "by task specific"
"task-specific features" is corrected as "task specific features"
"tasks ." is corrected as "tasks a"
". In" is corrected as "a In"
"paper ," is corrected as "paper a"
", we" is corrected as "a we"
"an adversarial" is corrected as "an adversary l"
"adversarial multi-task" is corrected as "adversary l multi ask"
"multi-task learning" is corrected as 

# 4. Stemming

In [52]:
stemmer = PorterStemmer()
show_stemm = 10
def stem_tockens(tockenized_text):
    stemmed_text = []
    for word in tockenized_text:
        stemmed_text.append(stemmer.stem(word))
    print(tockenized_text[0:show_stemm])
    print(stemmed_text[0:show_stemm])
    return stemmed_text

## 4.1 Student Course Feedback

In [53]:
result = stem_tockens(student_feedback_tockenize)
write_to_file(result, 'stemmer/{}_output.txt'.format(TEXT.asn))

['Honestly', 'last', 'seven', 'lectures', 'are', 'good', '.', 'Lectures', 'are', 'understandable']
['honestli', 'last', 'seven', 'lectur', 'are', 'good', '.', 'lectur', 'are', 'understand']
Written to ../output/stemmer/assignment_output.txt


## 4.2 Twitter Feed

In [54]:
result = stem_tockens(twitter_data_tockenize)
write_to_file(result, 'stemmer/{}_output.txt'.format(TEXT.twt))

['Reminds', 'me', 'of', 'Liberal', 'Immigration', 'Fraudster', 'Monsef', 'avoiding', 'deportation', 'from']
['remind', 'me', 'of', 'liber', 'immigr', 'fraudster', 'monsef', 'avoid', 'deport', 'from']
Written to ../output/stemmer/twitter_output.txt


## 4.3 Research Paper

In [55]:
result = stem_tockens(research_data_tockenize)
write_to_file(result, 'stemmer/{}_output.txt'.format(TEXT.rsh))

['Neural', 'network', 'models', 'have', 'shown', 'their', 'promising', 'opportunities', 'for', 'multi-task']
['neural', 'network', 'model', 'have', 'shown', 'their', 'promis', 'opportun', 'for', 'multi-task']
Written to ../output/stemmer/research_output.txt


# 5. Lemmatization

In [56]:
lemmatizer = WordNetLemmatizer() 
show_lemm = 10
def lemmatize_tockens(tockenized_text):
    lemmatized_text = []
    for word in tockenized_text:
        lemmatized_text.append(lemmatizer.lemmatize(word))
    print(tockenized_text[0:show_lemm])    
    print(lemmatized_text[0:show_lemm])
    return lemmatized_text

## 4.1 Student Course Feedback

In [57]:
result = lemmatize_tockens(student_feedback_tockenize)
write_to_file(result, 'lemmatize/{}_output.txt'.format(TEXT.asn))

['Honestly', 'last', 'seven', 'lectures', 'are', 'good', '.', 'Lectures', 'are', 'understandable']
['Honestly', 'last', 'seven', 'lecture', 'are', 'good', '.', 'Lectures', 'are', 'understandable']
Written to ../output/lemmatize/assignment_output.txt


## 4.2 Twitter Feed

In [58]:
result = lemmatize_tockens(twitter_data_tockenize)
write_to_file(result, 'lemmatize/{}_output.txt'.format(TEXT.twt))

['Reminds', 'me', 'of', 'Liberal', 'Immigration', 'Fraudster', 'Monsef', 'avoiding', 'deportation', 'from']
['Reminds', 'me', 'of', 'Liberal', 'Immigration', 'Fraudster', 'Monsef', 'avoiding', 'deportation', 'from']
Written to ../output/lemmatize/twitter_output.txt


## 4.3 Research Paper

In [59]:
result = lemmatize_tockens(research_data_tockenize)
write_to_file(result, 'lemmatize/{}_output.txt'.format(TEXT.rsh))

['Neural', 'network', 'models', 'have', 'shown', 'their', 'promising', 'opportunities', 'for', 'multi-task']
['Neural', 'network', 'model', 'have', 'shown', 'their', 'promising', 'opportunity', 'for', 'multi-task']
Written to ../output/lemmatize/research_output.txt
