In [1]:
import nltk #.. Import Natural Language ToolKit To work with human language data.
from nltk.corpus import stopwords #.. Help To Remove stop words like ‘the’, ‘is’, ‘are’. 
from nltk.tokenize import word_tokenize #.. Help To Divides a String Into Substrings. 
import pandas as pd #.. providing high-performance data analysis tools.
from pandas import DataFrame #.. potentially heterogeneous tabular data structure with labeled axes (rows and columns).
import itertools 
##
#.. implements a number of iterator building blocks inspired by constructs from APL, Haskell, and SML.
##
import numpy as np #..  fundamental package for scientific computing with Python.
import re #.. provides regular expression matching operations.
from nltk.corpus import inaugural
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.neural_network import MLPClassifier

##
# for interface
##
from tkinter import *
import tkinter.messagebox



In [2]:
# Load the labelled definitions. The relative path is resolved against the
# kernel's working directory. Later cells read the 'Definitions' and 'Results'
# columns of this frame (see prepareDataSets).
df = pd.read_csv('testt-data.csv')

# Preprocessing

In [3]:
def normalize(df):
    """Strip basic punctuation from each item of ``df`` and join the pieces.

    Args:
        df: An indexable sequence of strings (items are read as ``df[0]`` ..
            ``df[len(df) - 1]``). Despite the name it does not have to be a
            DataFrame; prepareDataSets passes a list of tokens.

    Returns:
        str: The items with every ``,`` ``.`` ``'`` ``!`` ``?`` removed,
        joined by single spaces. An item that consisted only of those
        characters becomes an empty string, which leaves two adjacent spaces
        in the result.
    """
    punctuation = re.compile(r"[,.'!?]")
    cleaned_items = (punctuation.sub('', df[position]) for position in range(len(df)))
    return ' '.join(cleaned_items)

In [4]:
def stopWordRemove(df):
    """Tokenize a piece of text and drop English stop words.

    Args:
        df: The text to filter. Despite the name this is a single string
            (it is handed straight to ``word_tokenize``).

    Returns:
        list[str]: The tokens of ``df`` in their original order, without any
        token that appears in NLTK's English stop-word list. Matching is exact
        (case-sensitive); prepareDataSets lower-cases the text before calling.
    """
    # A set gives constant-time membership tests instead of scanning a list
    # for every token.
    stop = set(stopwords.words("english"))
    # Tokenize exactly once. The previous version tokenized inside a loop over
    # range(len(df)), i.e. once per character of the text, so every kept token
    # was appended len(df) times and the work grew quadratically.
    return [word for word in word_tokenize(df) if word not in stop]

In [5]:
def prepareDataSets(df):
    """Build the cleaned, labelled training frame from the raw CSV frame.

    Args:
        df: DataFrame with a text column ``'Definitions'`` and a code column
            ``'Results'``.

    Returns:
        DataFrame with two columns:
          * ``'Definitions'`` - the lower-cased text with stop words removed,
            tokens re-joined by single spaces.
          * ``'Results[0]'`` - the class label derived from ``'Results'``
            (see the table below); any code not in the table maps to
            ``'false'``.
    """
    # Result code -> class label. 'T' marks a correct machine-learning
    # definition; presumably the others are DL = deep learning,
    # DM = data mining, NN = neural network, MV = machine vision,
    # FL = fuzzy logic, NL = natural language (matches the question list in
    # the interface cell) - confirm.
    label_by_result = {
        'T': "ML",
        'T1': "DL",
        'T2': "DM",
        'T3': "NN",
        'T4': "MV",
        'T5': "FL",
        'T6': "NL",
    }

    sentences = []
    for _, row in df.iterrows():
        tokens = stopWordRemove(row['Definitions'].lower())
        # Join the tokens while building the record. The previous version
        # stored the token list and patched each cell afterwards with
        # df_sentences['Definitions'][x] = ..., a chained assignment that
        # triggers SettingWithCopyWarning and does not update the frame when
        # pandas copy-on-write is enabled.
        # NOTE(review): the earlier version also called normalize() on the
        # tokens but discarded the result, so punctuation stripping was never
        # applied. That is still the case here - confirm whether it should be.
        sentences.append([' '.join(tokens), label_by_result.get(row['Results'], 'false')])

    return DataFrame(sentences, columns=['Definitions', 'Results[0]'])


In [6]:
# Preview the preprocessed frame: as the cell's bare last expression, the
# returned DataFrame is displayed. The result is not stored here; the next
# cell calls prepareDataSets(df) again to keep it.
prepareDataSets(df)

Unnamed: 0,Definitions,Results[0]
0,machine learning field computer science gives ...,ML
1,machine learning explores study construction a...,ML
2,machine learning computer program learn experi...,ML
3,machine learning method used devise complex mo...,ML
4,machine learning artificial intelligence ( ai ...,ML
5,machine learning method data analysis automate...,ML
6,machine learning science getting computers lea...,ML
7,machine learning field computer science gives ...,ML
8,machine learning ( ml ) type artificial intell...,ML
9,machine learning application artificial intell...,ML


In [7]:
# Keep the preprocessed frame; the training cell below reads its
# 'Definitions' and 'Results[0]' columns.
preprocessed_df = prepareDataSets(df)

# Feature Extraction

In [8]:
def featureExtraction(data, min_df=10, max_df=0.50, ngram_range=(1, 3)):
    """Fit a TF-IDF vectorizer on ``data`` and return it with the features.

    Args:
        data: Iterable of text documents (strings).
        min_df: Passed to ``TfidfVectorizer(min_df=...)``. Defaults to 10,
            the value that used to be hard-coded.
        max_df: Passed to ``TfidfVectorizer(max_df=...)``. Defaults to 0.50,
            the value that used to be hard-coded.
        ngram_range: Passed to ``TfidfVectorizer(ngram_range=...)``. Defaults
            to ``(1, 3)``, the value that used to be hard-coded.

    Returns:
        tuple: ``(vectorizer, tfidf_data)`` - the fitted ``TfidfVectorizer``
        and the result of ``fit_transform(data)``. Keep the vectorizer: new
        text must be transformed with the same fitted instance.
    """
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=ngram_range)
    tfidf_data = vectorizer.fit_transform(data)
    return vectorizer, tfidf_data

# Learning & Prediction

In [9]:
def learning(clf, X, Y):
    """Train one classifier on an 80/20 split of (X, Y) and print its metrics.

    Args:
        clf: Zero-argument callable returning an unfitted estimator, e.g. a
            classifier class such as ``LinearSVC``.
        X: Feature matrix (for example the TF-IDF matrix from
            featureExtraction).
        Y: Labels aligned with the rows of ``X``.

    Returns:
        None. Prints, in order: the 10-fold cross-validation scores computed
        on the training split, their mean with +/- two standard deviations,
        and a classification report for the held-out 20%.
    """
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    # sklearn.model_selection provides the same functions. Migrate together
    # with the import cell.
    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        X, Y, test_size=.2, random_state=43)
    classifier = clf()

    # Cross-validate on the training split. The previous version
    # cross-validated inside the 20% hold-out, so each fold's model saw only
    # a small fraction of the data.
    scores = cross_validation.cross_val_score(classifier, X_train, Y_train, cv=10)

    # Fit once on the training split and score the untouched hold-out with
    # that fitted model. Previously the fitted classifier was never used for
    # any prediction.
    classifier.fit(X_train, Y_train)
    predict = classifier.predict(X_test)

    print(scores)
    print("Accuracy of %s: %0.2f(+/- %0.2f)" % (classifier, scores.mean(), scores.std() * 2))
    print(classification_report(Y_test, predict))

In [10]:
def main(df, clf):
    """Run preprocessing, feature extraction and evaluation for one classifier.

    Args:
        df: Raw DataFrame with ``'Definitions'`` and ``'Results'`` columns.
            Pass ``None`` to load ``'testt-data.csv'`` instead.
        clf: Zero-argument callable returning an unfitted estimator; forwarded
            to learning().

    Returns:
        None. learning() prints the metrics.
    """
    # Honour the caller's frame. The previous version always overwrote `df`
    # with a fresh read of the CSV, so the argument was ignored.
    if df is None:
        df = pd.read_csv('testt-data.csv')
    preprocessed_df = prepareDataSets(df)
    # prepareDataSets names its label column 'Results[0]'; the previous
    # lookup of 'Results' raised KeyError.
    data, target = preprocessed_df['Definitions'], preprocessed_df['Results[0]']
    # featureExtraction returns (vectorizer, matrix); only the matrix is
    # needed here. The previous code passed the whole tuple to learning().
    _, tfidf_data = featureExtraction(data)
    learning(clf, tfidf_data, target)

In [11]:
# Candidate classifier classes (uninstantiated), in the form learning()
# expects for its `clf` argument (it calls `clf()` itself).
# NOTE(review): not referenced by any cell visible here - confirm it is used.
clfs = [MultinomialNB,BernoulliNB,SVC,LinearSVC]

In [12]:
# Train the classifier that run_program() later uses and print evaluation
# metrics. `test`, `tfidf_vectorizer` and `tfidf_data` are module-level names
# read by later cells.
test = LinearSVC()
data, target = preprocessed_df['Definitions'], preprocessed_df['Results[0]']
# Keep the fitted vectorizer: run_program() reuses it to transform new answers.
tfidf_vectorizer, tfidf_data = featureExtraction(data)
# Hold out 20% of the rows (random_state fixed at 43) for the metrics below.
X_train, X_test,  Y_train, Y_test = \
cross_validation.train_test_split(tfidf_data,target, test_size=.2,random_state=43)
# The model is fitted on ALL rows (tfidf_data), not only on X_train.
test.fit(tfidf_data, target)
# NOTE(review): both calls below cross-validate within the 20% hold-out only
# (X_test / Y_test); X_train / Y_train are never used in this cell.
# Presumably scikit-learn re-fits copies of `test` for each fold, so the fit
# above would not influence these numbers - confirm.
predict = cross_validation.cross_val_predict(test, X_test, Y_test, cv=10)
scores = cross_validation.cross_val_score(test, X_test, Y_test, cv=10)
print(scores)
# Mean fold accuracy with +/- two standard deviations.
print ("Accuracy of %s: %0.2f(+/- %0.2f)" % (test, scores.mean(), scores.std() *2))
print (classification_report(Y_test, predict))




[0.66666667 0.875      0.875      0.83333333 0.66666667 1.
 1.         1.         0.66666667 1.        ]
Accuracy of LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0): 0.86(+/- 0.28)
             precision    recall  f1-score   support

         DL       0.78      0.78      0.78         9
         DM       1.00      1.00      1.00         7
         FL       1.00      1.00      1.00         6
         ML       0.75      0.60      0.67         5
         MV       0.00      0.00      0.00         1
         NL       1.00      1.00      1.00         3
         NN       0.50      0.33      0.40         3
      false       0.77      0.89      0.83        19

avg / total       0.81      0.83      0.82        53



  'precision', 'predicted', average, warn_for)


In [13]:
# Predict labels for the same matrix the model was fitted on (tfidf_data),
# so this output shows predictions on the training rows, not an estimate of
# performance on unseen text.
Xnew = tfidf_data
ynew = test.predict(Xnew)
# Prints the full prediction array (one label per row).
print(ynew)

['ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML'
 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'ML' 'false' 'false'
 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false'
 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false'
 'ML' 'false' 'false' 'false' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL'
 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'DL' 'false'
 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false'
 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false'
 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'DM' 'DM'
 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM' 'DM'
 'DM' 'DM' 'DM' 'DM' 'DM' 'false' 'false' 'false' 'false' 'false' 'false'
 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false'
 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false' 'false'
 'false' 'NN' 'NN' 'NN' 'NN' 'NN' 'NN' 'NN' 'false

In [14]:
def run_program(answer):
    """Classify one free-text answer with the trained model.

    Relies on the module-level ``tfidf_vectorizer`` (fitted vectorizer) and
    ``test`` (fitted classifier) created by the training cell.

    Args:
        answer: The answer text to classify.

    Returns:
        The result of ``test.predict`` for the single transformed answer.
    """
    # Apply the same cleaning as prepareDataSets (lower-case, drop stop
    # words, re-join) so the answer yields the same kind of n-grams the
    # vectorizer was fitted on. The previous version vectorized the raw text.
    cleaned = ' '.join(stopWordRemove(answer.lower()))
    # transform() only: the vocabulary and IDF weights must come from the
    # training fit. The unused, never-fitted TfidfVectorizer that used to be
    # created here has been removed.
    X_test = tfidf_vectorizer.transform([cleaned])
    return test.predict(X_test)

# Interface

In [16]:
# Main window of the quiz interface.
root = Tk()
root.geometry('500x500')
root.title('Answer Correction')
root.configure(background="silver")

# Tk string variables. `textin` used to be bound to the StringVar class itself
# (missing call parentheses); it is now an instance like `mystring`.
# NOTE(review): neither variable is referenced by the code visible here -
# confirm they are needed.
textin = StringVar()
mystring = StringVar()

# Questions are asked in this order; onSubmit() advances through the list.
questions = [
    "What's the definition of 'Machine Learning' ?",
    "What's the definition of 'Deep Learning' ?",
    "What's the definition of 'Neural Network' ?",
    "What's the definition of 'Machine Vision' ?",
    "What's the definition of 'Fuzzy Logic' ?",
    "What's the definition of 'Natural Language' ?",
    "What's the definition of 'Data Mining' ?",
]

# Index of the current question and its text; both are updated by onSubmit().
count = 0
txtin = questions[count]

def onSubmit():
    """Handle a click on Submit.

    Reads the answer box, clears the output box and validates the answer's
    length. If valid, it classifies the answer with run_program(), shows the
    prediction and moves on to the next question. After the last question it
    shows a completion message and closes the window.

    Uses the module-level widgets ``txt``, ``output``, ``label`` and ``root``
    and updates the module-level ``count`` and ``txtin``.
    """
    global count
    global txtin

    # "end-1c" stops one character before the end index of the text widget.
    entered = txt.get("1.0", "end-1c")
    output.delete("1.0", END)

    if not chk_if_text_valid(entered):
        tkinter.messagebox.showerror('Error', 'Your Answer must be more than 50 letter')
        return

    rr = run_program(entered)
    output.insert("1.0", rr)
    count = count + 1

    # Derive the end of the quiz from the list instead of a hard-coded 6.
    if count >= len(questions):
        tkinter.messagebox.showinfo('Finished', 'The Exam is finished')
        # Close the window, which ends mainloop(). The previous exit() call
        # raised SystemExit from inside the Tk callback, which is unsuitable
        # when this runs inside a notebook kernel.
        root.destroy()
        return

    # Show the next question.
    txtin = questions[count]
    label.config(text="Question: "+txtin)
    
## validation of length of text
def chk_if_text_valid(str):
    """Tell whether an answer is long enough to be graded.

    Args:
        str: The answer text (the parameter name shadows the builtin; it is
            kept for compatibility with existing callers).

    Returns:
        bool: True when the text has 50 or more characters, otherwise False.
    """
    minimum_length = 50
    return not len(str) < minimum_length

# Static title at the top of the window.
label_0 = Label(root, text="Answer Correction", width=20, font=("bold", 20))
label_0.place(x=90, y=10)

# Current question; onSubmit() rewrites this label's text for each new question.
label = Label(root, bg="red", text="Question: "+txtin, width=50, font=("bold", 10))
label.place(x=50, y=53)

# Instruction shown above the answer box.
label_1= Label(root, text="Write Your Answer (MUST answer more than 50 letter: ", width=50, font=('bold', 10))
label_1.place(x=50,y=130)

# Answer input; onSubmit() reads its content.
txt=Text(root, width=50, height=8, font=("none 12"), bg='white')
txt.place(x=20, y=160)

# Submit button wired to onSubmit. The widget is not kept in a variable.
Button(root, text="Submit",width=20,bg="brown",fg="white",command=onSubmit).place(x=180, y=330)

# Box where onSubmit() writes the prediction.
# NOTE(review): the window is 500x500 (root.geometry) and this 8-line box in a
# size-20 font starts at y=400, so it likely extends past the bottom edge - confirm.
output = Text(root, width=20, height=8, font=('Time 20 bold'), fg="black")
output.place(x=100, y=400)

# Start the Tk event loop.
mainloop()