In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Subset of 20 Newsgroups topics used for both the train and test splits.
categories = [
    'alt.atheism',
    'comp.graphics',
    'rec.motorcycles',
    'sci.space',
    'talk.politics.guns',
]

In [None]:
# Fetch the training split, restricted to the selected categories.
# Documents are shuffled with a fixed seed (2017), and the `remove` tuple
# asks the loader to strip headers, footers and quoted replies from each post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=2017,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Fetch the test split with the same categories, shuffle seed (2017) and
# `remove` settings as the training split.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=2017,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Pull the label arrays out of the two loaded splits.
y_train, y_test = newsgroups_train.target, newsgroups_test.target

In [None]:
# Configure the TF-IDF vectorizer (construction only; nothing is fitted here).
vectorizer = TfidfVectorizer(
    # Vocabulary: unigrams and bigrams, English stop words dropped, and
    # max_df=0.5 as the document-frequency cut-off.
    ngram_range=(1, 2),
    stop_words='english',
    max_df=0.5,
    # Weighting: sublinear term-frequency scaling and smoothed IDF.
    sublinear_tf=True,
    smooth_idf=True,
)

In [None]:
# Fit the vectorizer on the training documents and vectorize them in one step.
X_train = vectorizer.fit_transform(newsgroups_train.data)
# Vectorize the test documents with the already-fitted vectorizer
# (transform only; it is not re-fitted on the test split).
X_test = vectorizer.transform(newsgroups_test.data)

In [None]:
# Summarize the training split: document count, number of categories,
# and the (n_samples, n_features) shape of the TF-IDF matrix.
print(
    "Train Dataset",
    "%d documents" % len(newsgroups_train.data),
    "%d categories" % len(newsgroups_train.target_names),
    "n_samples: %d, n_features: %d" % X_train.shape,
    sep="\n",
)

In [None]:
# Summarize the test split: document count, number of categories,
# and the (n_samples, n_features) shape of the TF-IDF matrix.
print(
    "Test Dataset",
    "%d documents" % len(newsgroups_test.data),
    "%d categories" % len(newsgroups_test.target_names),
    "n_samples: %d, n_features: %d" % X_test.shape,
    sep="\n",
)