In [1]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
# MultinomialNB
from sklearn.naive_bayes import MultinomialNB
# accuracy_score
from sklearn.metrics import accuracy_score, classification_report


file = 'bully_model_SGD.sav'

In [2]:
df = pd.read_csv('bullying_tweets.csv')

In [3]:
loaded_model = pickle.load(open(file, 'rb'))
X_train, X_test, y_train, y_test = train_test_split(df['tweet_text'], df['bully_status'], random_state=42)


In [4]:
count_vector = CountVectorizer(stop_words = 'english', lowercase = True, ngram_range=(1,1))

training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [5]:
input = count_vector.transform(['needs stop bullying', 'happy morning', 'looking fat', 'you are so ugly', 'so beautiful', 'you are so stupid', 'I love you', 'good morning', 'you are the beautiful person'])

print(loaded_model.predict(input))

[0 0 1 1 1 1 0 0 1]


Lets try other model

In [6]:
classifier = MultinomialNB()
classifier.fit(training_data, y_train)

In [7]:
testPrediction = classifier.predict(testing_data)
print("Accuracy:", accuracy_score(y_test, testPrediction))
print("\nClassification Report:\n", classification_report(y_test, testPrediction))

Accuracy: 0.8517990438647991

Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.36      0.46      2062
           1       0.88      0.95      0.91      9861

    accuracy                           0.85     11923
   macro avg       0.75      0.66      0.69     11923
weighted avg       0.83      0.85      0.83     11923



In [8]:
from sklearn.model_selection import GridSearchCV
# make_scorer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.pipeline import Pipeline


In [9]:
def param_tuning(clf, param_dict, X_train, y_train, X_test, y_test):
    
    # make scorer object
    scorer = make_scorer(f1_score)

    # perform Grid Search for Parameters
    grid_obj = GridSearchCV(estimator = clf,
                           param_grid = param_dict,
                           scoring = scorer,
                           cv = 5)

    grid_fit = grid_obj.fit(X_train, y_train)

    # Get the estimator
    best_clf = grid_fit.best_estimator_

    # Make predictions using the unoptimized and model
    predictions = (clf.fit(X_train, y_train)).predict(X_test)
    best_predictions = best_clf.predict(X_test)
    
    # Report the before-and-afterscores
    print(clf.__class__.__name__)
    print("\nOptimized Model\n------")
    print("Best Parameters: {}".format(grid_fit.best_params_))
    print("Accuracy: {:.4f}".format(accuracy_score(y_test, best_predictions)))
    print("F1-score: {:.4f}".format(f1_score(y_test, best_predictions)))
    print("Precision: {:.4f}".format(precision_score(y_test, best_predictions)))
    print("Recall: {:.4f}".format(recall_score(y_test, best_predictions)))
    

In [10]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# Define the parameter grid
parameters = {
    'vect__stop_words': (None, 'english'),
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams or bigrams
    'clf__alpha': (1e-2, 1e-1, 1, 2, 5),    # smoothing parameter
    'clf__fit_prior': (True, False),        # whether to learn class prior probabilities or not
}

# Perform the grid search with cross-validation
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.858
Best parameters set:
	clf__alpha: 2
	clf__fit_prior: True
	vect__ngram_range: (1, 1)
	vect__stop_words: 'english'


In [11]:
param_grid = {
    'alpha': [2, 0.02, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001,10, 20, 1000],
    'fit_prior': [True, False]
}

param_tuning(classifier, param_grid, training_data, y_train, testing_data, y_test)


MultinomialNB

Optimized Model
------
Best Parameters: {'alpha': 2, 'fit_prior': True}
Accuracy: 0.8540
F1-score: 0.9171
Precision: 0.8648
Recall: 0.9760


In [12]:
classifier.alpha = 2
classifier.fit_prior = True
classifier.fit(training_data, y_train)


In [18]:
filename = 'bully_model_NB.sav'
import pickle

pickle.dump(classifier, open(filename, 'wb'))

In [19]:

loaded_model3 = pickle.load(open(file, 'rb'))
predictions = loaded_model3.predict(input)


In [20]:
print(predictions)

[0 0 1 1 1 1 0 0 1]
