-
Notifications
You must be signed in to change notification settings - Fork 0
/
refuzzGW.py
244 lines (212 loc) · 9.74 KB
/
refuzzGW.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import pandas as pd
import numpy as np
import pickle
import time
import freqWordSelection as fws
# Importing all Classified Data
# pdataset = pd.read_csv('TestOnly/positive_tweets.csv', names=['tweet', 'classified'])
# ndataset = pd.read_csv('TestOnly/negative_tweets.csv', names=['tweet', 'classified'])
# nudataset = pd.read_csv('TestOnly/neutral_tweets.csv', names=['tweet', 'classified'])
# ---------------------------------------
import linecache
import sys
def PrintException():
    """Print the file, line number, source text and exception object for
    the exception currently being handled.

    Must be called from inside an ``except`` block: ``sys.exc_info()``
    returns ``(None, None, None)`` otherwise, and ``tb.tb_frame`` would
    raise AttributeError.
    """
    exc_type, exc_obj, tb = sys.exc_info()
    frame = tb.tb_frame
    lineno = tb.tb_lineno
    filename = frame.f_code.co_filename
    # Refresh linecache in case the source file changed on disk.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, frame.f_globals)
    # Single-argument parenthesised print is valid under both Python 2
    # and 3; the original used the Python-2-only print statement.
    print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(
        filename, lineno, line.strip(), exc_obj))
try:
    # Merging Positives and Negatives for Analysis.
    # Row 0 of the CSV is its header row, hence the iloc[1:, :] slice below.
    mainset = pd.read_csv("GlobalWarming/tweet_global_warming.csv",
                          names=['tweet', 'classified'])
    mainset = mainset.iloc[1:, :]
    # Map textual labels to numeric codes; any other label becomes NaN
    # and is dropped on the next line.
    mainset['classified'] = mainset['classified'].map(
        {'Positive': 1, 'Negative': 0, 'Neutral': -1})
    mainset = mainset.dropna().reset_index(drop=True)  # RESOLVER #01
except KeyError as e:
    print("Check parameters / headers of CSV file:%s" % e)
    # sys.exit instead of the bare `exit` builtin: `exit` is a site-module
    # convenience and is not guaranteed to exist for every interpreter run.
    sys.exit(0)
import SL_NB_Processor as cnp
import plotter as pltr
import nltk  # needed at module level by the fuzzy helpers below


def _run_naive_bayes(mainset):
    """Run the supervised and unsupervised Naive-Bayes pipelines.

    Returns the ground-truth label counts so the random-forest section can
    reuse them for its comparison plot.
    """
    machine, X, y = cnp.processor(mainset)
    print("\nSupervised Learning: Naive Bayes \nResults:")
    prediction = cnp.sl_prediction(machine, X, y)
    print("SL: NBC - Conf. Matrix".center(45, '_'), "\n", prediction, "\n")
    pltr.bars(prediction, plt_name="SL - Naive Bayes Classifier \n For Global Warming")

    # UN-SUPERVISED LEARNING ALGORITHM
    import USL_NB_Processor as usl_nb
    # No train/test split, hence the target column is discarded.
    machine, X, _ = usl_nb.processor(mainset)
    pred_df = usl_nb.usl_prediction(machine, X)
    print("Un-Supervised Learning: Naive Bayes \n Results:")
    # BUG FIX: the original printed the *supervised* confusion matrix under
    # this unsupervised-predictions banner; show the predictions instead.
    print("USL: NBC - Predictions".center(45, '_'), "\n", pred_df.head(), "\n")
    print(pred_df.head())
    pred_df.to_csv('NBpredictions')
    gt = mainset['classified'].value_counts()
    pr = pred_df.iloc[:, -1].value_counts()
    pltr.biplt(gt, pr, "UnSupervised Naive Bayes \n For Global Warming")
    return gt


def _run_random_forest(mainset, gt):
    """Run the supervised and unsupervised random-forest pipelines.

    *gt* is the ground-truth label count Series from the NB section.
    """
    import SL_RanForGen as sl_rfg
    print("Supervised Learning: RANDOM FOREST GENERATION \n Results:")
    machine, X, y = sl_rfg.read_fit(mainset)
    prediction = sl_rfg.rfg_spv_predict(machine, X, y)
    print("SL: RFG - Conf. Matrix".center(45, '_'), "\n", prediction, "\n")
    pltr.bars(prediction, plt_name="SL - Random Forest Classifier \n For Global Warming")

    import USL_RanForGen as usl_rfg
    print("Un-supervised Learning: RANDOM FOREST GENERATION \n Results:")
    machine, X, _ = usl_rfg.read_fit(mainset)  # y discarded: no training
    pred_df_rf = usl_rfg.rfg_usp_predict(machine, X)
    print("USL: NBC - Predictions".center(45, '_'), "\n", pred_df_rf.head(), "\n")
    pr = pred_df_rf.iloc[:, -1].value_counts()
    pltr.biplt(gt, pr, "UnSupervised Random Forest Gen. \n For Global Warming")


def _base_polarity(scores):
    """Map a VADER score dict onto a coarse 'Positive'/'Negative' label."""
    pos = scores.get('pos')
    neg = scores.get('neg')
    neu = scores.get('neu')
    if pos > neg:
        return 'Positive'
    if neg > pos:
        return 'Negative'
    # pos == neg: fall back on the neutral score.
    return 'Positive' if neu >= 0.5 else 'Negative'


def _fuzzy_label(pos_tags, label, synonyms, freq_tags):
    """Intensify *label* using adverb/adjective POS patterns.

    Tri-pattern (RB, RB, JJ) upgrades to 'Highly X', bi-pattern (RB, JJ)
    to 'Moderately X' (a no-op if the tri-pattern already fired, since the
    label is then no longer a bare 'Positive'/'Negative'); a bare adjective
    confirms but never changes the label.  A pattern only fires when its
    words appear in *synonyms* (D) and *freq_tags* (F).
    """
    tri_pairs = [(w1, w2, w3)
                 for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(pos_tags)
                 if t1.startswith("RB") and t2.startswith("RB") and t3.startswith("JJ")]
    # BUG FIX: the original tested `pairs[0] or pairs[1] or pairs[2] in D`,
    # which by precedence is `bool(pairs[0]) or ...` and IndexErrors on a
    # short list; test membership of the actual pattern words instead.
    if tri_pairs and any(w in synonyms for pair in tri_pairs for w in pair):
        print("[True]: Tri Pairs are found in Drought Rel. Term")
        if any(w in freq_tags for pair in tri_pairs for w in pair):
            print("[True]: Tri Pairs are found in Frequent Wordset")
            if label == "Positive":
                label = "Highly Positive"
            elif label == "Negative":
                label = "Highly Negative"
        else:
            print("[False]: Doesn't Match with Frequent Wordset\n")
    elif tri_pairs:
        print("[False]: Tri Pairs Matched Nowhere in D\n")
    else:
        print("[TriPair(F)]: Pattern for Adverb, Adverb, Adjective did not match.\n"
              " Looking for Bi-Pair Patterns\n")
    print(tri_pairs)

    # -------------------- PATTERN ADVERB, ADJECTIVE --------------------
    bi_pairs = [(w1, w2)
                for (w1, t1), (w2, t2) in nltk.bigrams(pos_tags)
                if t1.startswith("RB") and t2.startswith("JJ")]
    if bi_pairs and any(w in synonyms for pair in bi_pairs for w in pair):
        print("[True]: Bi Pairs are found in Drought Rel. Term")
        if any(w in freq_tags for pair in bi_pairs for w in pair):
            print("[True]: Bi Pairs are found in Frequent Wordset")
            if label == "Positive":
                label = "Moderately Positive"
            elif label == "Negative":
                label = "Moderately Negative"
        else:
            print("[False]: Bi Pairs found missing in Freq. Wordset")
    elif bi_pairs:
        print("[False]: Bi Pairs Matched Nowhere in D")
    else:
        print("[BiPair(F)]: Pattern Not Matched, Looking for Mono Pattern")
    print(bi_pairs)

    # ------------------------ PATTERN ADJECTIVE ------------------------
    for w, tag in pos_tags:
        print(w, " - ", tag)
        if tag.startswith("JJ"):
            if w in synonyms:
                print("Matched with D")
                if w in freq_tags:
                    print("Matched with F")
                else:
                    print("Couldn't Match with F")
            else:
                print("the")
        else:
            print(w, "is not an ADJECTIVE")
    return label


def _fuzzy_kitchen(mainset):
    """Fuzzy sentiment labelling of every tweet plus summary CSVs/plots."""
    nltk.download('averaged_perceptron_tagger')
    nltk.download('vader_lexicon')
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    tweets = mainset.iloc[0:, 0]
    print(tweets[0])
    # Concatenate all tweets (kept for the optional frequency analysis);
    # str.join avoids the quadratic `+=` loop of the original.
    all_tweets = " " + "".join(
        tw for tw in mainset.iloc[:, 0] if not isinstance(tw, float))
    print(type(all_tweets))  # ISSUE #01 RESOLVED

    # D is Synonyms
    D = list(pd.read_csv('GlobalWarming/synonym_global_warming.csv').iloc[0:, 0])
    # F is Popular tags
    F = list(pd.Series(pd.read_csv("TestOnly/ptag_area.csv").iloc[:, 0]))

    # Hoisted out of the loop: the original rebuilt the analyzer per tweet.
    sia = SentimentIntensityAnalyzer()
    rows = []
    for i in range(len(tweets)):
        text = tweets.iloc[i]
        # Python-3 replacement for decode('unicode_escape').encode('ascii',
        # 'ignore'): drop non-ASCII characters before tokenising.
        sent = nltk.word_tokenize(text.encode('ascii', 'ignore').decode('ascii'))
        print(i)
        pos_tags = nltk.pos_tag(sent)
        scores = sia.polarity_scores(text=text)
        print("POS:", scores.get('pos'))
        print("NEG:", scores.get('neg'))
        print("NEU:", scores.get('neu'))
        label = _fuzzy_label(pos_tags, _base_polarity(scores), D, F)
        rows.append({'tweets': text, 'classified': label})

    # DataFrame.append was removed in pandas 2.0; build from the row list.
    fuzzy_df = pd.DataFrame(rows, columns=['tweets', 'classified'])
    fuzzy_df.to_csv("GlobalWarming/ReFuzzy.csv", index=False)

    fws_df = fws.findFreqWord(fuzzyDF=fuzzy_df)
    # `.sum(level=0)` was removed in pandas 2.0; groupby(level=0) is the
    # supported spelling of the same reduction.
    sum_df = (pd.get_dummies(fws_df[['Classified', 'FreqWord']],
                             columns=['FreqWord'])
              .set_index('Classified')
              .groupby(level=0).sum())
    sum_df.columns = sum_df.columns.str.split('_').str[1]
    sum_df.to_csv('GlobalWarming/ClassFreq.csv')

    def count(lab):
        # Number of tweets assigned the given fuzzy label.
        return (fuzzy_df['classified'] == lab).sum()

    pltr.stackplotter(count('Highly Negative'), count('Moderately Negative'),
                      count('Negative'), count('Highly Positive'),
                      count('Moderately Positive'), count('Positive'),
                      "Fuzzy Logic Stats")
    pltr.simple_plot(dataframe=sum_df)

    import os
    try:
        # Best effort: open the result spreadsheet on Debian/LibreOffice hosts.
        os.system("libreoffice --calc ReFuzzy.csv")
    except Exception:
        print("This Feature works with Debian Based OS with Libre Office only")


try:
    gt = _run_naive_bayes(mainset)
    _run_random_forest(mainset, gt)
    # ----------------------------- FUZZY KITCHEN -----------------------------
    _fuzzy_kitchen(mainset)
except Exception as e:
    print("[Refuzz]:", e)
    PrintException()