In [85]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import  make_scorer
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import CRF, scorers, metrics
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV

In [29]:
df = pd.read_csv('crf_file_real.csv', encoding = "ISO-8859-1")
#df = df[:100]
df.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Another,A
1,,violent,P
2,,and,P
3,,aggressive,P
4,,immigrant,P


In [30]:
df.isnull().sum()

Sentence #    286722
Word               2
Tag                0
dtype: int64

In [31]:
df = df.fillna(method='ffill')

In [32]:
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(7939, 28772, 2)

In [33]:
# df.head(50)

In [34]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,A,274071
1,P,20590


In [35]:
X = df.drop('Tag', axis=1)
X.head()

Unnamed: 0,Sentence #,Word
0,Sentence: 1,Another
1,Sentence: 1,violent
2,Sentence: 1,and
3,Sentence: 1,aggressive
4,Sentence: 1,immigrant


In [36]:
X.columns

Index(['Sentence #', 'Word'], dtype='object')

In [37]:
# v = DictVectorizer(sparse=False)
# X = v.fit_transform(X.to_dict('records'))
# X.shape

In [38]:
y = df.Tag.values

In [39]:
classes = np.unique(y)

In [40]:
classes = classes.tolist()
classes

['A', 'P']

In [41]:
X.shape, y.shape

((294661, 2), (294661,))

In [42]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [43]:
#X_train.shape, y_train.shape

In [44]:
new_classes = classes.copy()
#new_classes.pop()
new_classes

['A', 'P']

In [45]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [46]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(),  
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
#         print(self.sentences)
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [47]:
getter = SentenceGetter(df)

In [48]:
# df.head()

In [49]:
sent = getter.get_next()
#print(sent)

[('Another', 'A'), ('violent', 'P'), ('and', 'P'), ('aggressive', 'P'), ('immigrant', 'P'), ('killing', 'A'), ('a', 'A'), ('innocent', 'A'), ('and', 'A'), ('intelligent', 'A'), ('US', 'A'), ('Citizen', 'A'), ('Sarcasm', 'A'), ('.', 'A')]


In [50]:
sentences = getter.sentences

In [51]:
def word2features(sent, i):
    word = sent[i][0]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [52]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [53]:
# for i in range (0,len(y)):
#     for j in range (0,len(y[i])):
#         if y[i][j]==0:
#             y[i][j]='O'
#         else :
#             y[i][j]='P'

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [61]:
# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     c1=0.1,
#     c2=0.1,
#     max_iterations=100,
#     all_possible_transitions=True
# )
# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=new_classes)
# crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [56]:
# print(X_train[0])
# print(y_train[0])

[{'bias': 1.0, 'word.lower()': 'this', 'word[-3:]': 'his', 'word[-2:]': 'is', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'BOS': True, '+1:word.lower()': 'woman', '+1:word.istitle()': False, '+1:word.isupper()': False}, {'bias': 1.0, 'word.lower()': 'woman', 'word[-3:]': 'man', 'word[-2:]': 'an', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'this', '-1:word.istitle()': True, '-1:word.isupper()': False, '+1:word.lower()': 'is', '+1:word.istitle()': False, '+1:word.isupper()': False}, {'bias': 1.0, 'word.lower()': 'is', 'word[-3:]': 'is', 'word[-2:]': 'is', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'woman', '-1:word.istitle()': False, '-1:word.isupper()': False, '+1:word.lower()': 'shameless', '+1:word.istitle()': False, '+1:word.isupper()': False}, {'bias': 1.0, 'word.lower()': 'shameless', 'word[-3:]': 'ess', 'word[-2:]': 'ss', 'word.isupper()': False, 'wor

In [62]:
# y_pred = crf.predict(X_test)
# metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

0.9291046155107456

In [66]:
#Lets evaluate the mode
# report = flat_classification_report(y_pred=y_pred, y_true=y_test)
# print(report)



              precision    recall  f1-score   support

           A       0.95      0.99      0.97     55042
           P       0.66      0.29      0.40      4108

    accuracy                           0.94     59150
   macro avg       0.80      0.64      0.69     59150
weighted avg       0.93      0.94      0.93     59150



In [80]:
# print(type(X_train))
# X_train1=X_train[:100]
# y_train1=y_train[:100]

<class 'list'>


In [81]:
crf3 = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=new_classes)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  6.7min finished


RandomizedSearchCV(estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True, c1=0.1, c2=0.1,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f5d2e825790>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f5d2e8550d0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['A', 'P']),
                   verbose=1)

In [82]:
#Lets check the best estimated parameters and CV score
print('Best parameters:', rs.best_params_)
print('Best CV score:', rs.best_score_)
print('Model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

Best parameters: {'c1': 0.019493421450778304, 'c2': 0.006944928951472884}
Best CV score: 0.9206737637084466
Model size: 0.10M


In [84]:
#Now we create the model again using the best estimators
crf3 = rs.best_estimator_
y_pred = crf3.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=new_classes, digits=3))



              precision    recall  f1-score   support

           A      0.939     0.994     0.966     55042
           P      0.631     0.138     0.226      4108

    accuracy                          0.935     59150
   macro avg      0.785     0.566     0.596     59150
weighted avg      0.918     0.935     0.914     59150



In [136]:
# for i in range(0,len(X_test[0])):
#     print(X_test[0][i]["word.lower()"],end=" ")
# print(y_pred[0])
# print(len(X_test[0]), len(y_pred[0]))

are u stupid did you not read article small amount of marijuana massive amounts of meth and herion yep blame it on weed you obviously never smoked any if so you would realize a "reffer" head would have been to lazy and hungry to steal a boat as a matter of fact i guarantee alcohol which is legal same as marijuana played huge part in this not defending the meth heads or tweakers out there but really your simple minded antics are what is wrong with our community . ['A', 'A', 'P', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A']
89 89


In [142]:
# print(X_test[1])

[{'bias': 1.0, 'word.lower()': 'one', 'word[-3:]': 'One', 'word[-2:]': 'ne', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'BOS': True, '+1:word.lower()': 'idiot', '+1:word.istitle()': False, '+1:word.isupper()': False}, {'bias': 1.0, 'word.lower()': 'idiot', 'word[-3:]': 'iot', 'word[-2:]': 'ot', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'one', '-1:word.istitle()': True, '-1:word.isupper()': False, '+1:word.lower()': 'so', '+1:word.istitle()': False, '+1:word.isupper()': False}, {'bias': 1.0, 'word.lower()': 'so', 'word[-3:]': 'so', 'word[-2:]': 'so', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'idiot', '-1:word.istitle()': False, '-1:word.isupper()': False, '+1:word.lower()': 'far', '+1:word.istitle()': False, '+1:word.isupper()': False}, {'bias': 1.0, 'word.lower()': 'far', 'word[-3:]': 'far', 'word[-2:]': 'ar', 'word.isupper()': False, 'word.istitle()': 

In [141]:
import joblib
  
# Save the model as a pickle in a file
joblib.dump(crf3, 'crf_model_real.pkl')
  
# # Load the model from the file
# knn_from_joblib = joblib.load('filename.pkl') 
  
# # Use the loaded model to make predictions
# knn_from_joblib.predict(X_test)

['crf_model1.pkl']

In [31]:
#!pip install -U 'scikit-learn<0.24'

Collecting scikit-learn<0.24
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 168 kB/s eta 0:00:01     |███████████████████             | 4.0 MB 489 kB/s eta 0:00:06
[31mERROR: imbalanced-learn 0.8.0 has requirement scikit-learn>=0.24, but you'll have scikit-learn 0.23.2 which is incompatible.[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Successfully installed scikit-learn-0.23.2
