## Balancing an imbalanced data distribution

In [20]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from imblearn.metrics import classification_report_imbalanced

# Load the "train" and "test" subsets of the 20 newsgroups dataset.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ("train", "test")
)

In [5]:
# `.data` is the input handed to the vectorizer in later cells;
# `.target` holds the class label for each sample.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [6]:
from collections import Counter

# Tally how many samples each class has in the two splits.
train_counts = Counter(y_train)
test_counts = Counter(y_test)

print(f"Training class distributions summary: {train_counts}")
print(f"Test class distributions summary: {test_counts}")


Training class distributions summary: Counter({10: 600, 15: 599, 8: 598, 9: 597, 11: 595, 7: 594, 13: 594, 14: 593, 5: 593, 2: 591, 12: 591, 3: 590, 6: 585, 1: 584, 4: 578, 17: 564, 16: 546, 0: 480, 18: 465, 19: 377})
Test class distributions summary: Counter({10: 399, 15: 398, 8: 398, 9: 397, 7: 396, 13: 396, 11: 396, 5: 395, 2: 394, 14: 394, 12: 393, 3: 392, 6: 390, 1: 389, 4: 385, 17: 376, 16: 364, 0: 319, 18: 310, 19: 251})


In [17]:
# Baseline without any resampling: TF-IDF features fed to multinomial naive Bayes.
baseline_steps = (TfidfVectorizer(), MultinomialNB())
model = make_pipeline(*baseline_steps)

# `fit` returns the fitted pipeline, so prediction can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [18]:
# Accuracy of the baseline model on the test split.
# Arguments follow the documented (y_true, y_pred) order, matching the
# classification report call below; the value is the same either way.
accuracy_score(y_test, y_pred)

0.7738980350504514

In [19]:
# Per-class report for the current `y_pred`, printed as plain text.
report = classification_report_imbalanced(y_test, y_pred)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.52      0.99      0.63      0.72      0.49       319
          1       0.81      0.65      0.99      0.72      0.80      0.62       389
          2       0.82      0.65      0.99      0.73      0.81      0.63       394
          3       0.67      0.78      0.98      0.72      0.87      0.75       392
          4       0.86      0.77      0.99      0.81      0.88      0.75       385
          5       0.89      0.75      0.99      0.82      0.87      0.73       395
          6       0.93      0.69      1.00      0.80      0.83      0.67       390
          7       0.85      0.92      0.99      0.88      0.95      0.90       396
          8       0.94      0.93      1.00      0.93      0.96      0.92       398
          9       0.92      0.90      1.00      0.91      0.95      0.89       397
         10       0.89      0.97      0.99      0.93      0.98      0.96       399
   

In [12]:
# TF-IDF -> random over-sampling -> multinomial naive Bayes.
# The sampler is seeded so the resampled training set, and therefore the
# scores reported below, are reproducible on Restart & Run All.
model = make_pipeline_imb(
    TfidfVectorizer(),
    RandomOverSampler(random_state=0),
    MultinomialNB(),
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [13]:
# Accuracy of the over-sampled model on the test split.
# Arguments follow the documented (y_true, y_pred) order; the value is the
# same either way.
accuracy_score(y_test, y_pred)

0.7963356346255974

In [16]:
# Per-class report for the over-sampled model. The cell source had been
# truncated to a bare `c`, which raises NameError on a fresh kernel; the
# recorded output below is an imbalanced classification report.
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.68      0.76      0.98      0.72      0.86      0.73       319
          1       0.78      0.67      0.99      0.72      0.82      0.65       389
          2       0.82      0.65      0.99      0.73      0.80      0.63       394
          3       0.69      0.76      0.98      0.72      0.87      0.73       392
          4       0.85      0.80      0.99      0.83      0.89      0.78       385
          5       0.90      0.75      1.00      0.82      0.86      0.73       395
          6       0.92      0.70      1.00      0.80      0.84      0.68       390
          7       0.89      0.90      0.99      0.89      0.94      0.88       396
          8       0.95      0.92      1.00      0.94      0.96      0.92       398
          9       0.94      0.89      1.00      0.91      0.94      0.87       397
         10       0.91      0.96      0.99      0.94      0.98      0.96       399
   

In [23]:
# TF-IDF -> random under-sampling -> linear SVM.
# Both the sampler and the classifier are seeded so the run is reproducible
# on Restart & Run All.
model = make_pipeline_imb(
    TfidfVectorizer(),
    RandomUnderSampler(random_state=0),
    LinearSVC(random_state=0),
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)



In [24]:
# Accuracy of the under-sampled LinearSVC model on the test split.
# Arguments follow the documented (y_true, y_pred) order; the value is the
# same either way.
accuracy_score(y_test, y_pred)

0.8397503983005842

In [None]:
from imblearn.ensemble import RUSBoostClassifier
from sklearn.metrics import balanced_accuracy_score

# Single TF-IDF -> RUSBoost pipeline fitted directly on the raw documents.
# This removes the dependency on `training_features` / `testing_features`,
# which are only created in a later cell, and avoids constructing the
# estimator twice (the previous `model` pipeline was built but never used).
#
# The sparse TF-IDF matrix is passed through as-is rather than densified with
# `.toarray()`, which would allocate a very large dense array.
#
# NOTE(review): `algorithm='SAMME.R'` is no longer passed explicitly because
# recent scikit-learn / imbalanced-learn releases deprecate and then remove
# that option; older releases use it as the default anyway. Confirm against
# the installed versions.
rusboost = RUSBoostClassifier(n_estimators=200, random_state=0)
model = make_pipeline(TfidfVectorizer(), rusboost)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [39]:
# Fit the TF-IDF vocabulary on the training documents only, then reuse it to
# transform the test documents.
# The documents are passed directly: the previous `np.array(...)` wrapper
# relied on `np`, which is not imported in this notebook, and was not needed.
vectorizer = TfidfVectorizer()

training_features = vectorizer.fit_transform(X_train)
testing_features = vectorizer.transform(X_test)