In [2]:
import pandas as pd
from pandas import DataFrame as df

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim
from gensim.models import Word2Vec #@
from gensim.utils import simple_preprocess #@
from gensim.models.keyedvectors import KeyedVectors #@

#Keras
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model #모델 저장

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Utility
import re
import numpy as np
import time #수행시간 측정




In [3]:
# =============== Settings =============== #

# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string: "\S" is not a valid Python string escape, so the non-raw literal
# triggers an invalid-escape warning. The pattern text itself is unchanged.
# NOTE(review): the `http?:\S` alternative can only win over `https?:\S+` for
# text starting with "htt:" - confirm whether it is intentional.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# Preprocessing resources
# preprocess() tests every token for membership in stop_words, so the word
# list is stored as a frozenset to make each lookup constant-time instead of
# a linear scan of the list.
stop_words = frozenset(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# KERAS
# NOTE(review): none of these three values is referenced in the cells visible
# here (padding in predict() uses max_len) - presumably kept from training.
SEQUENCE_LENGTH = 300
EPOCHS, BATCH_SIZE = 8, 1024

# SENTIMENT
# Label strings returned by decode_sentiment().
NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"
# (lower, upper) score bounds: <= lower is NEGATIVE, >= upper is POSITIVE,
# anything in between is NEUTRAL.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

In [4]:
# Load the training data (the CSV has no header row, so column names are supplied)
dataset = pd.read_csv(
    './train.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)
print(dataset.shape)  # 1600000,6

(1600000, 6)


In [5]:
dataset.head() # preview the first rows; target uses 0 for negative and 4 for positive

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Text preprocessing for the training data
def preprocess(text, stem=False):
    """Clean one piece of text and return it as a space-joined string.

    The input is converted with ``str()``, lower-cased, and every match of
    ``TEXT_CLEANING_RE`` is replaced by a space. The result is split on
    whitespace and tokens found in ``stop_words`` are dropped.

    Args:
        text: Value to clean; it is passed through ``str()`` first.
        stem: When true, each kept token is replaced by ``stemmer.stem(token)``.

    Returns:
        The kept tokens joined by single spaces.
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = map(stemmer.stem, kept)
    return " ".join(kept)

In [7]:
# Run the preprocessing step over every row of the text column
dataset["text"] = dataset["text"].apply(preprocess)

In [8]:
# Hold out a (1 - TRAIN_SIZE) share of the rows for testing; the fixed
# random_state keeps the split repeatable between runs.
train, test = train_test_split(
    dataset,
    random_state=42,
    test_size=1 - TRAIN_SIZE,
)
print("TRAIN size:", len(train))
print("TEST size:", len(test))

TRAIN size: 1280000
TEST size: 320000


In [9]:
# Length that predict() pads encoded sequences to
max_len = 50

# Fit the tokenizer on the cleaned training text; num_words is set to vocab_size
vocab_size = 400000
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(train.text)

In [10]:
# Load the saved model file from the working directory; predict() scores text with it
model = load_model('./text-CNN.h5')

In [11]:
def decode_sentiment(score, include_neutral=True):
    """Translate a model score into one of the sentiment label constants.

    Args:
        score: Numeric score to classify.
        include_neutral: When true, use the two bounds in
            ``SENTIMENT_THRESHOLDS`` and allow a NEUTRAL result; otherwise
            split at 0.5 into NEGATIVE / POSITIVE only.

    Returns:
        ``NEGATIVE``, ``NEUTRAL`` or ``POSITIVE``.
    """
    if not include_neutral:
        # Binary mode: anything not strictly below 0.5 counts as positive.
        if score < 0.5:
            return NEGATIVE
        return POSITIVE

    if score <= SENTIMENT_THRESHOLDS[0]:
        return NEGATIVE
    if score >= SENTIMENT_THRESHOLDS[1]:
        return POSITIVE
    # Scores strictly between the two bounds are neutral.
    return NEUTRAL

In [23]:
def predict(ex_text, include_neutral=True):
    """Score one piece of text with the loaded model and label its sentiment.

    Args:
        ex_text: Text to classify. It is encoded with the fitted tokenizer
            ``tk`` and post-padded to ``max_len``.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        dict with keys ``"label"`` (sentiment label), ``"score"`` (Python
        float) and ``"elapsed_time"`` (seconds spent in this call).

    Raises:
        ValueError: If the model output does not hold exactly one value.
    """
    # NOTE(review): the tokenizer was fit on preprocess()-cleaned text, while
    # ex_text is encoded as-is here - confirm whether it should be cleaned too.
    # perf_counter is a monotonic clock, so the measured duration cannot be
    # skewed by wall-clock adjustments.
    start_at = time.perf_counter()
    x_encoded = tk.texts_to_sequences([ex_text])
    res_test = pad_sequences(x_encoded, maxlen=max_len, padding='post')
    # Predict
    raw_output = model.predict([res_test])
    # Extract the single score as a plain float up front. Calling float() on a
    # multi-dimensional array is deprecated in NumPy, and decoding the label
    # from an array relies on single-element truthiness. .item() still fails
    # loudly if the output holds more than one value.
    score = float(np.asarray(raw_output).item())
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": score,
            "elapsed_time": time.perf_counter() - start_at}

In [None]:
# Quick manual check: classify one short example sentence
predict("I love it")

In [12]:
# Create the (initially empty) results DataFrame; makeRes() adds rows to it
df_res = pd.DataFrame(
    {column: [] for column in ('text', 'label', 'score', 'elapsed_time')}
)

In [13]:
def makeRes(samples):
    """Run predict() on each row of ``samples`` and record it in ``df_res``.

    The value under label ``0`` of each row is used as the text. Results are
    written into the module-level ``df_res`` under the row's index label, so
    repeated calls accumulate (or overwrite rows with the same label).

    Args:
        samples: DataFrame whose rows hold the text under label ``0``.

    Returns:
        The module-level ``df_res`` DataFrame.
    """
    for row_label, row in samples.iterrows():
        text = row[0]
        outcome = predict(text)
        df_res.loc[row_label] = [
            text,
            outcome['label'],
            outcome['score'],
            outcome['elapsed_time'],
        ]
    return df_res

In [31]:
from flask import Flask, jsonify, render_template, request
from werkzeug.utils import secure_filename


In [32]:
from flask_cors import CORS

In [33]:
# Create the Flask application object and wrap it with flask_cors' CORS extension
app = Flask(__name__)
CORS(app)

<flask_cors.extension.CORS at 0x2958788fcc8>

In [34]:
# Render the upload HTML page
@app.route('/')
def render_file():
    """Serve the ``file.html`` template at the site root."""
    return render_template('file.html')

In [35]:
def predict_sentiment(sample):
    """Classify every row of ``sample`` and collect the results column-wise.

    The value under label ``0`` of each row is used as the text. The
    ``elapsed_time`` reported by predict() is not included in the result.

    Args:
        sample: DataFrame whose rows hold the text under label ``0``.

    Returns:
        dict with parallel lists under the keys ``'text'``, ``'label'`` and
        ``'score'``, in row order.
    """
    scored = [(row[0], predict(row[0])) for _, row in sample.iterrows()]
    return {
        'text': [text for text, _ in scored],
        'label': [outcome['label'] for _, outcome in scored],
        'score': [outcome['score'] for _, outcome in scored],
    }

In [36]:
# Handle the uploaded file and return the predictions (consumed by the node server)
@app.route('/fileUploaded', methods=['POST'])
def upload_file():
    """Save the uploaded text file, classify each line and return the result.

    Expects a multipart form upload under the field name ``file``. The file is
    stored in the working directory under its sanitized name and then parsed
    with one sample per line.

    Returns:
        The dict produced by ``predict_sentiment`` (keys ``text``, ``label``,
        ``score``), or an error dict with status 400 when the upload has no
        usable file name.
    """
    f = request.files['file']
    # Sanitize once and use the SAME name for saving and reading. Reading the
    # raw client-supplied f.filename could miss the saved file or open an
    # arbitrary path chosen by the client.
    saved_name = secure_filename(f.filename or "")
    if not saved_name:
        return {"error": "no valid file name supplied"}, 400
    f.save(saved_name)
    # NOTE(review): recent pandas releases reject '\n' as a separator -
    # confirm against the pandas version this service is pinned to.
    sample = pd.read_csv(saved_name, sep="\n", encoding="utf8", header=None)
    # Run inference exactly once (it was previously computed twice, and the
    # first result was bound to a local that shadowed predict()).
    return predict_sentiment(sample)

In [43]:
@app.route('/info')
def dataInfo2():
    """Return a fixed payload with ``name`` and ``account`` for ``/info``."""
    return {"name": "test", "account": "@realDonald"}

In [37]:
import json
def toJson(df):
    """Serialise a DataFrame to a JSON string holding a list of row records.

    Args:
        df: pandas DataFrame to serialise.

    Returns:
        str: JSON array with one object per row (column name -> value),
        re-encoded through ``json.dumps``.
    """
    # orient='records' yields a JSON array, so the parsed value is a list.
    # The previous parsed['data'] lookup raised TypeError on every call.
    records = json.loads(df.to_json(orient='records'))
    return json.dumps(records)

In [None]:
new_json={} # create an empty dict

In [None]:
# Scratch: load sample.txt with newline as the separator and no header row,
# using the same read_csv arguments as upload_file(), then display the frame.
s2=pd.read_csv('sample.txt',sep = "\n",encoding = "utf8",header=None)
s2

In [None]:
# Scratch: build a dict whose values are lists of different lengths
a = [1,2,3,4]
b = ['a','bc']
dic = {'key':a,'b':b}

In [None]:
# Scratch: show the type of the dict built in the previous cell
type(dic)

In [None]:
# Scratch: display the current contents of new_json
new_json

In [None]:
# NOTE(review): no earlier cell assigns a module-level `res`; the later cells
# bind it to a JSON string, for which this key lookup would fail. This cell
# depends on leftover kernel state.
res['label']

In [None]:
# Serialise the accumulated results frame to a JSON string, one record per row
res = df_res.to_json(orient='records')


In [None]:
# NOTE(review): `parsed` is first assigned in a later cell (json.loads(res)),
# so this cell only works when run out of order.
parsed[0]['text']

In [None]:
# Scratch: show the type of res
type(res)

In [None]:
# Parse the JSON string in res back into Python objects and display them
parsed = json.loads(res)
parsed

In [None]:
# Scratch: display parsed again
parsed

In [None]:
# Re-encode the parsed objects as a JSON string, overwriting res
res=json.dumps(parsed)  

In [40]:
# Flask JSON output settings (ASCII-only encoding off, pretty-printing on),
# then start the development server when this is the main module.
app.config.update(
    JSON_AS_ASCII=False,
    JSONIFY_PRETTYPRINT_REGULAR=True,
)
if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off
 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [03/Nov/2020 16:19:37] "[37mGET /info HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Nov/2020 16:19:38] "[37mGET /info HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Nov/2020 16:19:38] "[37mGET /info HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Nov/2020 16:20:13] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Nov/2020 16:31:34] "[37mGET /info HTTP/1.1[0m" 200 -
127.0.0.1 - - [03/Nov/2020 16:31:53] "[37mGET /info HTTP/1.1[0m" 200 -
