## [재난예측기 - tweets 데이터 ]

### 방법1 :  파이썬 코딩( tfidf 직접 구하기 )

### < 기계학습 준비하기 > 
#### 1. 라이브러리 가져오기

In [1]:
import pandas as pd
import numpy as np
import nltk
import re

#### 2-1. 데이터셋 준비하기

In [3]:
df_raw = pd.read_csv('[Dataset]_Module25_disasters_social_media.csv',encoding='ISO-8859-1')

df_choice_one = df_raw[df_raw['choose_one'] != "Can't Decide"]
df = df_choice_one[['text', 'choose_one']].copy()
relevance = {'Relevant':1,'Not Relevant':0}
df['relevance'] = df.choose_one.map(relevance) 
df.head(3)

Unnamed: 0,text,choose_one,relevance
0,Just happened a terrible car crash,Relevant,1
1,Our Deeds are the Reason of this #earthquake M...,Relevant,1
2,"Heard about #earthquake is different cities, s...",Relevant,1


### 2-2. 데이터 전처리하기

In [4]:
# 1. 텍스트에서 단어를 추출 : extract_words(sentence) 
def extract_words(sentence):
    '''This is to clean and tokenize words'''
    # 특수 문자를 공백으로 바꿉니다.
    ignore_words = ['a', 'the', 'if', 'br', 'and', 'of', 'to', 'is', 'are', 'he', 'she', 'my', 'you', 'it','how']
    words = re.sub("[^\w]", " ",  sentence).split() # 모든 특수 문자를 ' '로 대체합니다.
    words = [word.lower() for word in words]   
    words_cleaned = [w.lower() for w in words if w not in ignore_words]   
    return words_cleaned 
    
# 2. 단어의 빈도를 계산합니다. 
def map_book(hash_map, tokens):
    if tokens is not None:
        for word in tokens:
            # 단어가 존재합니까?
            if word in hash_map:
                hash_map[word] = hash_map[word] + 1
            else:
                hash_map[word] = 1
        return hash_map
    else:
        return None

# 3.각 단어의 발생 빈도를 계산한 값으로 hash map을 만듭니다. = > 2 단계 호출
def make_hash_map(df):
    # 해시맵을 생성합니다.
    hash_map = {}
    
    # DataFrame의 각 행에 대해 반복합니다.
    for index, row in df.iterrows():
        # 단어의 빈도를 계산합니다.
        hash_map = map_book(hash_map, extract_words(row['text']))
    
    # 해시맵을 반환합니다.
    return hash_map

#토큰 화 된 데이터 세트에서 해시 맵 (단어 및 빈도) 생성
hash_map = make_hash_map(df)   
hash_map

{'just': 453,
 'happened': 31,
 'terrible': 14,
 'car': 133,
 'crash': 166,
 'our': 150,
 'deeds': 2,
 'reason': 29,
 'this': 702,
 'earthquake': 72,
 'may': 113,
 'allah': 11,
 'forgive': 4,
 'us': 176,
 'all': 381,
 'heard': 56,
 'about': 312,
 'different': 17,
 'cities': 11,
 'stay': 40,
 'safe': 15,
 'everyone': 73,
 'there': 264,
 'forest': 105,
 'fire': 363,
 'at': 745,
 'spot': 30,
 'pond': 7,
 'geese': 1,
 'fleeing': 3,
 'across': 27,
 'street': 37,
 'i': 2486,
 'cannot': 15,
 'save': 57,
 'them': 165,
 'near': 84,
 'la': 25,
 'ronge': 1,
 'sask': 1,
 'canada': 17,
 'residents': 13,
 'asked': 14,
 'shelter': 9,
 'in': 2805,
 'place': 34,
 'being': 121,
 'notified': 1,
 'by': 768,
 'officers': 11,
 'no': 400,
 'other': 66,
 'evacuation': 67,
 'or': 294,
 'orders': 12,
 'expected': 18,
 '13': 33,
 '000': 6,
 'people': 282,
 'receive': 3,
 'wildfires': 16,
 'california': 159,
 'got': 160,
 'sent': 14,
 'photo': 64,
 'from': 613,
 'ruby': 2,
 'alaska': 9,
 'as': 485,
 'smoke': 66,


In [5]:
# 4.frequent_vocab 함수를 다음과 같은 입력으로 정의하십시오 : word_freq 및 max_features
def frequent_vocab(word_freq, max_features): 
    counter = 0  # 값 0으로 카운터를 초기화하십시오
    vocab = []   # Vocab이라는 빈 목록을 만듭니다
    # 단어를 빈도수가 낮은 순서로 사전에 나열합니다
    for key, value in sorted(word_freq.items(), key=lambda item: (item[1], item[0]), reverse=True): 
       # 상위(max_features) 단어 수를 얻기 위한 루프 함수
        if counter<max_features: 
            vocab.append(key)
            counter+=1
        else: break
    return vocab

# 빈도수 높은 상위 500개 단어 선정     
vocab = frequent_vocab(hash_map, 500)  
vocab[:10]

['t', 'co', 'http', 'in', 'i', 'ã', 's', 'for', 'on', 'that']

In [6]:
# 5.다음과 같은 입력으로 함수 bagofwords를 정의: sentence, words
def bagofwords(sentence, words):
    sentence_words = extract_words(sentence) # 문장/트윗을 토큰화하고 변수 sentence_words에 할당
    # 빈도 단어 수
    bag = np.zeros(len(words)) # 크기가 len(words)이고 0으로 구성된 NumPy 배열 생성
    # 트윗에 토큰이 있을 때 데이터를 반복하고 1의 값을 추가
    for sw in sentence_words:
        for i,word in enumerate(words):
            if word == sw: 
                bag[i] += 1
                
    return np.array(bag) # 하나의 트윗에 대한 단어 가방 반환

# 6. bow 생성 : df['text'] 에 대해 단어 가방을 포함하도록 지정된 차원이있는 숫자 배열을 설정합니다.
n_words = len(vocab)
n_docs = len(df)
bag_o = np.zeros([n_docs,n_words])
# 루프 함수를 사용하여 각 트윗에 대해 새 행을 추가합니다.
for ii in range(n_docs): 
    # 이전 함수 'bagofwords'를 호출합니다. 입력을 참조하십시오 : sentence, words
    bag_o[ii,:] = bagofwords(df['text'].iloc[ii], vocab) 

bag_o[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
# 7.idf 구하기
# 트윗 수(numdocs)와 토큰/워드 수(numwords)를 나타내는 변수 2개 초기화
numdocs, numwords = np.shape(bag_o)
# 위와 같이 TFIDF 수식으로 변경
N = numdocs
word_frequency = np.empty(numwords)
# 단어가 나타나는 문서 수를 계산
for word in range(numwords):
    word_frequency[word]=np.sum((bag_o[:,word]>0)) 

idf = np.log(N/word_frequency)
idf[:5]

array([0.56037575, 0.64356806, 0.74842242, 1.45884525, 1.73955499])

In [8]:
# 8.tf-idf 구하기
# 초기화 tfidf 배열
tfidf = np.empty([numdocs, numwords])

# 트윗에서 반복, 용어 빈도(단어 가방으로 표시)를 idf로 곱합니다.
for doc in range(numdocs):
    tfidf[doc, :]=bag_o[doc, :]*idf

tfidf[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### 3. 학습용/테스트용 데이터셋 분리하기

In [13]:
# X_all과 y_all을 교육 및 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split( tfidf , df['relevance'].values, shuffle=True)

### < 기계학습하기 >
#### 1. 라이브러리 가져오기

In [10]:
from sklearn.linear_model import LogisticRegression #로지스틱 회귀 모형 가져오기
from sklearn.model_selection import train_test_split # 데이터를 훈련 및 테스트 세트로 분할
from sklearn.model_selection import GridSearchCV # 모델의 가장 적합한 매개 변수를 찾기 위해

#### 2. 모델 생성하고 학습하기

In [11]:
# 모델 인스턴스(instance) 작성
logreg = LogisticRegression(solver = 'liblinear')

#### 3. 데이터에 대한 모델 훈련, 데이터로부터 배운 정보 저장

In [14]:
# 학습 세트에 모델 적합
logreg.fit(X_train,y_train)

#### 4. 모델을 사용하여 테스트 데이터를 기반으로 관련성 예측 및 정확도

In [16]:
y_pred = logreg.predict(X_test)
print (y_pred)

[0 1 0 ... 1 0 0]


In [17]:
# 스코어 방법을 사용하여 모델의 정확성을 얻습니다
score = logreg.score(X_test, y_test)
print(score)
print('Accuracy of logistic regression classifier on test set: {:.3f}'.format(score))

0.7686924493554328
Accuracy of logistic regression classifier on test set: 0.769


### 5. 재난 예측기로 테스트 하기

In [20]:
def twitter_predictor(tweet):
    # your code here - 코드 완성됨
    word_vector = bagofwords(tweet, vocab) # 단어 가방 변수를 설정합니다.bagofwords 함수를 기억하십니까?
    word_tfidf = word_vector*idf # tfidf값 찾기
    prediction = logreg.predict(word_tfidf.reshape(1, -1)) # 트윗이 자연재해와 관련이 있는지 없는지 예측
    results = {1:'Relevant', 0:'Not Relevant'} # 잠재적인 결과를 포함하는 집합을 만듭니다."Relevant" 및 "Not relevant" 태그를 변경할 수 있습니다.
    #print(results[int(prediction)])  : prediction은 1D 배열이며, 이를 직접 int() 함수에 전달하려고 하면 경고
    print(results[int(prediction[0])])

In [80]:
# your code here
tweet1 = '200 houses were on fire after an electric spark burns the stack of wood'
tweet2 = 'Michael curry is on the roll as he scored the fifth goal on the football tournament.'
tweet3 = 'Michael curry is on fire as he scored the fifth goal on the football tournament.'
tweet4 = "Natural disasters can cause significant damage to homes, infrastructure, and the environment, leading to loss of life and displacement of communities."

twitter_predictor(tweet1)
twitter_predictor(tweet2)
twitter_predictor(tweet3)
twitter_predictor(tweet4)

Relevant
Not Relevant
Not Relevant
Relevant


### 방법2 : 라이브러리 사용하여 재난예측기 만들기

In [22]:
# khs code
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

df_raw = pd.read_csv('[Dataset]_Module25_disasters_social_media.csv',encoding='ISO-8859-1')

df_choice_one = df_raw[df_raw['choose_one'] != "Can't Decide"]
df = df_choice_one[['text', 'choose_one']].copy()
df['relevance'] = df['choose_one'].replace({'Relevant' : 1, 'Not Relevant' : 0})
# relevance = {'Relevant':1,'Not Relevant':0}
# df['relevance'] = df.choose_one.map(relevance) 

X_all = df["text"]
y_all = df['relevance'].values

X_train, X_test, y_train, y_test = train_test_split( X_all , y_all, shuffle=True)

vectorizer = CountVectorizer(analyzer = "word", strip_accents=None, tokenizer = None, \
                             preprocessor = None, stop_words = None, max_features = 5000) 
train_data_features = vectorizer.fit_transform(X_train)
test_data_features = vectorizer.transform(X_test)

tfidfier = TfidfTransformer()
tfidf = tfidfier.fit_transform(train_data_features)
tfidf_test = tfidfier.transform(test_data_features)

tfidf_X_train = tfidf.toarray()
tfidf_X_test = tfidf_test.toarray()

def classify():
    rf = LogisticRegression()
    rf.fit(tfidf_X_train, y_train)
    print(rf.score(tfidf_X_test, y_test))
    print(rf.predict(tfidf_X_test))
    return rf

model = classify()
y_pred = model.predict(tfidf_X_test)
print(f"Accuracy : {((model.score(tfidf_X_test,y_test))*100).round(2)} %")

def twitter_predictor(model, input_sentence):
    # CountVectorizer와 TfidfTransformer를 이용하여 입력 문장 변환
    input_data_features = vectorizer.transform([input_sentence])
    input_tfidf = tfidfier.transform(input_data_features)

    # rf = classify()
    # 변환된 입력을 모델에 적용하여 감정 예측
    prediction = model.predict(input_tfidf)
    results = {1: 'Relevant', 0: 'Not Relevant'} 

    # 결과 출력
    print(f' "{input_sentence}" is predicted as : {results[prediction[0]]}' )

test_sentence =[
                "200 houses were on fire after an electric spark burns the stack of wood",
                "Michael curry is on the roll as he scored the fifth goal on the football tournament.",
                "Michael curry is on fire as he scored the fifth goal on the football tournament.",
                "Natural disasters can cause significant damage to homes, infrastructure, and the environment, leading to loss of life and displacement of communities."]

for test in test_sentence : 
    twitter_predictor(model, test)

0.8084714548802947
[0 0 0 ... 0 0 1]
Accuracy : 80.85 %
 "200 houses were on fire after an electric spark burns the stack of wood" is predicted as : Relevant
 "Michael curry is on the roll as he scored the fifth goal on the football tournament." is predicted as : Not Relevant
 "Michael curry is on fire as he scored the fifth goal on the football tournament." is predicted as : Not Relevant
 "Natural disasters can cause significant damage to homes, infrastructure, and the environment, leading to loss of life and displacement of communities." is predicted as : Relevant


## [감정분석 예측기 - 영화 데이터 ]

### 방법1 : 라이브러리 사용하여 예측기 만들기

### 1. 라이브러리가져오기

In [24]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer # 이 기능은 단어 가방을 만들 수 있도록 도와줍니다.
from sklearn.feature_extraction.text import TfidfTransformer # 이 기능은 단어 가방을 자동으로 정규화합니다.
from sklearn.linear_model import LogisticRegression #로지스틱 회귀 모형 가져오기
from sklearn.model_selection import train_test_split # 데이터를 훈련 및 테스트 세트로 분할
from sklearn.model_selection import GridSearchCV # 모델의 가장 적합한 매개 변수를 찾기 위해

### 2. 데이터 읽어 저장하기

In [25]:
df_raw = pd.read_pickle('[Dataset]_Module25_df_raw.pkl')
df_raw_test = pd.read_pickle('[Dataset]_Module25_df_raw_test.pkl')

### 3.BOW 생성하기 - CountVerorize()사용

In [26]:
vectorizer = CountVectorizer(analyzer = "word", strip_accents=None, tokenizer = None, \
                             preprocessor = None, stop_words = None, max_features = 5000) 
train_data_features = vectorizer.fit_transform(df_raw['text'])
test_data_features = vectorizer.transform(df_raw_test['text'])

### 4. BOW 정규화하기 - TfidfTransformer() => .fit_transform() 메서드를 사용

In [27]:
tfidfier = TfidfTransformer()
tfidf = tfidfier.fit_transform(train_data_features)
tfidf_test = tfidfier.transform(test_data_features)

### 5. 학습/테스트 데이터셋 정의하기

In [28]:
X_all = tfidf.toarray()
y_all = df_raw['positive'].values
X_test = tfidf_test.toarray()
y_test = df_raw_test['positive'].values

### 6. 분류기 만들어 예측하기

In [29]:
def classify():
    rf = LogisticRegression()
    rf.fit(X_all,y_all)
    print(rf.score(X_test,y_test))
    print(rf.predict(X_test))
    return rf

classify()

0.88252
[1 1 1 ... 1 1 1]


In [30]:
def sentiment_prediction(input_sentence):
    # CountVectorizer와 TfidfTransformer를 이용하여 입력 문장 변환
    input_data_features = vectorizer.transform([input_sentence])
    input_tfidf = tfidfier.transform(input_data_features)

    rf = classify()
    # 변환된 입력을 모델에 적용하여 감정 예측
    prediction = rf.predict(input_tfidf)
    results = {1: 'Positive', 0: 'Negative'} 

    # 결과 출력
    print('The sentence is predicted as:', results[prediction[0]])

test_sentence = "The movie was absolutely fantastic! The storyline was captivating, the acting was superb, and the visuals were stunning."
sentiment_prediction(test_sentence)

0.88252
[1 1 1 ... 1 1 1]
The sentence is predicted as: Positive


### 방법2 : 파이썬 프로그래밍으로 예측기 만들기 ( tfidf 구하기 )

In [161]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer # 이 기능은 단어 가방을 만들 수 있도록 도와줍니다.
from sklearn.feature_extraction.text import TfidfTransformer # 이 기능은 단어 가방을 자동으로 정규화합니다.
from sklearn.linear_model import LogisticRegression #로지스틱 회귀 모형 가져오기
from sklearn.model_selection import train_test_split # 데이터를 훈련 및 테스트 세트로 분할

In [32]:
df_raw = pd.read_pickle('[Dataset]_Module25_df_raw.pkl')
df_raw_test = pd.read_pickle('[Dataset]_Module25_df_raw_test.pkl')
df_all = pd.concat([df_raw, df_raw_test])

In [34]:
df = df_all[['text','positive']]

In [35]:
def extract_words(sentence):
    '''This is to clean and tokenize words'''
    # 특수 문자를 공백으로 바꿉니다.
    ignore_words = ['a', 'the', 'if', 'br', 'and', 'of', 'to', 'is', 'are', 'he', 'she', 'my', 'you', 'it','how']
    words = re.sub("[^\w]", " ",  sentence).split() # 모든 특수 문자를 ' '로 대체합니다.
    words = [word.lower() for word in words]
    words_cleaned = [w.lower() for w in words if w not in ignore_words]
    return words_cleaned 

In [36]:
# khs code : make_hash_map(df) 내에 map_book(hash_map, tokens) 포함
def make_hash_map(df):
    # Create a hash map.
    hash_map = {}

    # Iterate over each row in the DataFrame.
    for index, row in df.iterrows():
    # 문장에서 단어들을 추출한다.
        words = extract_words(row['text'])

        # hash map에 단어를 추가한다
        for word in words:
            if word in hash_map:
                hash_map[word] = hash_map[word] + 1
            else:
                hash_map[word] = 1
    return hash_map

make_hash_map(df)

{'bromwell': 8,
 'high': 4342,
 'cartoon': 1099,
 'comedy': 6576,
 'ran': 482,
 'at': 46800,
 'same': 8096,
 'time': 25110,
 'as': 91750,
 'some': 31134,
 'other': 18274,
 'programs': 117,
 'about': 34160,
 'school': 3535,
 'life': 12917,
 'such': 10005,
 'teachers': 159,
 '35': 184,
 'years': 8759,
 'in': 186781,
 'teaching': 201,
 'profession': 133,
 'lead': 2696,
 'me': 21457,
 'believe': 4991,
 'that': 143879,
 's': 125008,
 'satire': 495,
 'much': 19318,
 'closer': 379,
 'reality': 2010,
 'than': 19330,
 'scramble': 10,
 'survive': 546,
 'financially': 58,
 'insightful': 141,
 'students': 749,
 'who': 42234,
 'can': 29059,
 'see': 23029,
 'right': 6529,
 'through': 9692,
 'their': 22750,
 'pathetic': 987,
 'pomp': 14,
 'pettiness': 6,
 'whole': 6122,
 'situation': 1384,
 'all': 46947,
 'remind': 285,
 'schools': 140,
 'i': 175633,
 'knew': 1826,
 'when': 28062,
 'saw': 6341,
 'episode': 3183,
 'which': 23402,
 'student': 807,
 'repeatedly': 267,
 'tried': 1589,
 'burn': 253,
 'dow

In [37]:
# frequent_vocab 함수를 다음과 같은 입력으로 정의하십시오 : word_freq 및 max_features
def frequent_vocab(word_freq, max_features): 
    counter = 0  # 값 0으로 카운터를 초기화하십시오
    vocab = []   # Vocab이라는 빈 목록을 만듭니다
    # 단어를 빈도수가 낮은 순서로 사전에 나열합니다
    for key, value in sorted(word_freq.items(), key= lambda item: (item[1], item[0]), reverse=True): 
       # 상위(max_features) 단어 수를 얻기 위한 루프 함수
        if counter < max_features: 
            vocab.append(key)
            counter += 1
        else: break
    return vocab

In [38]:
# 단어 가방을 포함하도록 지정된 차원이있는 숫자 배열을 설정합니다.
hash_map = make_hash_map(df)
vocab = frequent_vocab(hash_map, 500)

In [39]:
# 다음과 같은 입력으로 함수 bagofwords를 정의 - 벡터화: sentence, words
def bagofwords(sentence, words):
    sentence_words = extract_words(sentence) # 문장/트윗을 토큰화하고 변수 sentence_words에 할당
    # 빈도 단어 수
    bag = np.zeros(len(words)) # 크기가 len(words)이고 0으로 구성된 NumPy 배열 생성
    # 트윗에 토큰이 있을 때 데이터를 반복하고 1의 값을 추가
    for sw in sentence_words:
        for i,word in enumerate(words):
            if word == sw: 
                bag[i] += 1
                
    return np.array(bag) # 하나의 평가에 대한 단어 가방 반환

n_words = len(vocab)
n_docs = len(df)
bag_o = np.zeros([n_docs,n_words])
# 루프 함수를 사용하여 각 트윗에 대해 새 행을 추가합니다.
for ii in range(n_docs): 
    # 이전 함수 'bagofwords'를 호출합니다. 입력을 참조하십시오 : sentence, words
    bag_o[ii,:] = bagofwords(df['text'].iloc[ii], vocab)
    
# your code here
bag_o.shape

(50000, 500)

In [40]:
# 트윗 수(numdocs)와 토큰/워드 수(numwords)를 나타내는 변수 2개 초기화 (10860, 500)
numdocs, numwords = np.shape(bag_o)
# 위와 같이 TFIDF 수식으로 변경
N = numdocs
word_frequency = np.empty(numwords)

# 단어가 나타나는 문서 수를 계산
for word in range(numwords):
    word_frequency[word]=np.sum((bag_o[:,word]>0)) # 각 행(트윗수)에 word드가 포함된 수 함께

idf = np.log(N/word_frequency)

# 초기화 tfidf 배열
tfidf = np.empty([numdocs, numwords])

# 트윗에서 반복, 용어 빈도(단어 가방으로 표시)를 idf로 곱합니다.
for doc in range(numdocs):    
    tfidf[doc, :] = bag_o[doc, :]*idf

tfidf[:10]

array([[0.25521225, 1.13990115, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25521225, 0.22798023, 0.09884843, ..., 0.        , 0.        ,
        0.        ],
       [0.25521225, 0.68394069, 0.09884843, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.63803062, 0.        , 0.19769686, ..., 0.        , 0.        ,
        0.        ],
       [0.76563675, 0.22798023, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.63803062, 1.36788138, 0.88963589, ..., 0.        , 0.        ,
        0.        ]])

In [41]:
def classify(rf, X_all, y_all): # 훈련되지 않은 모델, tfidf 배열 및 훈련 대상의 값을 가져온다.
    X_train,X_test,y_train,y_test = train_test_split(X_all,y_all,shuffle=True) # 무작위로 두 개를 나누어 학습과 테스트 세트를 만든다.
    rf.fit(X_train,y_train) # 학습 세트에 모델을 맞춘다.
    print(rf.score(X_test,y_test)) # 테스트 세트에 정확도 점수를 출력한다.
    return rf # 훈련된 모델을 반환한다.

In [42]:
logreg = LogisticRegression(solver = 'newton-cg')

X_all = tfidf
y_all = df['positive'].values
logreg = classify(logreg, X_all, y_all)

0.84184


In [183]:
def sentiment_predictor(sentiment):
    # your code here
    word_vector = bagofwords(sentiment, vocab) # 단어 가방 변수를 설정합니다.bagofwords 함수를 기억하십니까?
    word_tfidf = word_vector*idf # tfidf값 찾기
       
    prediction = logreg.predict(word_tfidf.reshape(1, -1)) # 트윗이 자연재해와 관련이 있는지 없는지 예측
    results = {1:'positive', 0:'negative'} # 잠재적인 결과를 포함하는 집합을 만듭니다."Relevant" 및 "Not relevant" 태그를 변경할 수 있습니다.
    # 결과 출력
    print('The sentence is predicted as:', results[prediction[0]])   
    #print('The sentence is predicted as:', results[prediction[0]])

In [184]:
# test_sentence = "The movie was absolutely fantastic! The storyline was captivating, the acting was superb, and the visuals were stunning."
# test_sentence = "I loved every minute of the movie. The characters were relatable, the humor was on point, and the ending was satisfying."
# test_sentence = "I was disappointed with the movie. The plot was confusing, the acting felt forced, and the pacing was sluggish."
# est_sentence = "Unfortunately, the movie didn't live up to my expectations. The dialogue was uninspiring, the special effects were underwhelming, and the ending felt abrupt."
# bagofwords(test_sentence, vocab)
# predictor(test_sentence)

test_sentence =["The movie was absolutely fantastic! The storyline was captivating, the acting was superb, and the visuals were stunning.",
                "I loved every minute of the movie. The characters were relatable, the humor was on point, and the ending was satisfying.",
                "I was disappointed with the movie. The plot was confusing, the acting felt forced, and the pacing was sluggish.",
                "Unfortunately, the movie didn't live up to my expectations. The dialogue was uninspiring, the special effects were underwhelming, and the ending felt abrupt."]

for test in test_sentence : 
    bagofwords(test, vocab)
    sentiment_predictor(test)

The sentence is predicted as: negative
The sentence is predicted as: positive
The sentence is predicted as: negative
The sentence is predicted as: negative


In [None]:
def sentiment_prediction(input_sentence):
    # CountVectorizer와 TfidfTransformer를 이용하여 입력 문장 변환
    input_data_features = vectorizer.transform([input_sentence])
    input_tfidf = tfidfier.transform(input_data_features)

    # 변환된 입력을 모델에 적용하여 감정 예측
    prediction = lf.predict(input_tfidf)
    results = {1: 'Positive', 0: 'Negative'} 

    # 결과 출력
    print('The sentence is predicted as:', results[prediction[0]])

test_sentence =[
                "The movie was absolutely fantastic! The storyline was captivating, the acting was superb, and the visuals were stunning.",
                "I loved every minute of the movie. The characters were relatable, the humor was on point, and the ending was satisfying.",
                "I was disappointed with the movie. The plot was confusing, the acting felt forced, and the pacing was sluggish.",
                "Unfortunately, the movie didn't live up to my expectations. The dialogue was uninspiring, the special effects were underwhelming, and the ending felt abrupt."]

for test in test_sentence : 
    sentiment_prediction(test)

## @방법3 : 라이브러리 사용하여 감정 분석 모델 만들고 테스트 - 간략화

In [156]:
# khs code- type1
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

def sentiment_predictor1(X_train, y_train, X_test, y_test):
    global vectorizer
    global tfidfier
    global lf
    
    vectorizer = CountVectorizer(analyzer="word", strip_accents=None, tokenizer=None, \
                                 preprocessor=None, stop_words=None, max_features=5000)
    train_data_features = vectorizer.fit_transform(X_train)
    test_data_features = vectorizer.transform(X_test)

    tfidfier = TfidfTransformer()
    train_tfidf = tfidfier.fit_transform(train_data_features)
    test_tfidf = tfidfier.transform(test_data_features)

    lf = LogisticRegression(solver = 'newton-cg', max_iter=200)
    lf.fit(train_tfidf, y_train)
    print('Model_Accuracy:' , lf.score(test_data_features, y_test))
    
    prediction = lf.predict(test_tfidf)       
    results = {1: 'Positive', 0: 'Negative'}  
    prediction_df = pd.DataFrame({'Predicted': prediction, 'Actual': y_test})
    # print(prediction_df)

    # Calculate the accuracy of the model
    y_pred = np.mean(prediction_df['Predicted'] == prediction_df['Actual'])
    print('prediction_Accuracy:', y_pred) 
    
df_raw = pd.read_pickle('[Dataset]_Module25_df_raw.pkl')
df_raw_test = pd.read_pickle('[Dataset]_Module25_df_raw_test.pkl')

X_train = df_raw["text"]
y_train = df_raw['positive'].values
X_test = df_raw_test["text"]
y_test = df_raw_test['positive'].values

sentiment_predictor1(X_train, y_train, X_test, y_test)

Model_Accuracy: 0.81704
prediction_Accuracy: 0.88256


In [157]:
# khs code- type1
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

def sentiment_predictor1(X_train, y_train, X_test, y_test):
    global vectorizer
    global tfidfier
    global lf
    
    vectorizer = CountVectorizer(analyzer="word", strip_accents=None, tokenizer=None, \
                                 preprocessor=None, stop_words=None, max_features=5000)
    train_data_features = vectorizer.fit_transform(X_train)
    test_data_features = vectorizer.transform(X_test)

    tfidfier = TfidfTransformer()
    tfidfier.fit(train_data_features)
    test_tfidf = tfidfier.transform(test_data_features)

    lf = LogisticRegression(solver = 'newton-cg', max_iter=200)
    lf.fit(train_data_features, y_train)
    print('Model_Accuracy:' , lf.score(test_data_features, y_test))
    
    prediction = lf.predict(test_tfidf)       
    results = {1: 'Positive', 0: 'Negative'}  
    prediction_df = pd.DataFrame({'Predicted': prediction, 'Actual': y_test})
    # print(prediction_df)

    # Calculate the accuracy of the model
    y_pred = np.mean(prediction_df['Predicted'] == prediction_df['Actual'])
    print('prediction_Accuracy:', y_pred) 
    
df_raw = pd.read_pickle('[Dataset]_Module25_df_raw.pkl')
df_raw_test = pd.read_pickle('[Dataset]_Module25_df_raw_test.pkl')

X_train = df_raw["text"]
y_train = df_raw['positive'].values
X_test = df_raw_test["text"]
y_test = df_raw_test['positive'].values

sentiment_predictor1(X_train, y_train, X_test, y_test)

Model_Accuracy: 0.85144
prediction_Accuracy: 0.80612


In [256]:
def sentiment_prediction(input_sentence):
    # CountVectorizer와 TfidfTransformer를 이용하여 입력 문장 변환
    input_data_features = vectorizer.transform([input_sentence])
    input_tfidf = tfidfier.transform(input_data_features)

    # 변환된 입력을 모델에 적용하여 감정 예측
    prediction = lf.predict(input_tfidf)
    results = {1: 'Positive', 0: 'Negative'} 

    # 결과 출력
    print('The sentence is predicted as:', results[prediction[0]])

test_sentence =[
                "The movie was absolutely fantastic! The storyline was captivating, the acting was superb, and the visuals were stunning.",
                "I loved every minute of the movie. The characters were relatable, the humor was on point, and the ending was satisfying.",
                "I was disappointed with the movie. The plot was confusing, the acting felt forced, and the pacing was sluggish.",
                "Unfortunately, the movie didn't live up to my expectations. The dialogue was uninspiring, the special effects were underwhelming, and the ending felt abrupt."]

for test in test_sentence : 
    sentiment_prediction(test)

The sentence is predicted as: Positive
The sentence is predicted as: Positive
The sentence is predicted as: Negative
The sentence is predicted as: Negative
