# Using TF-IDF (Term frequency-inverse document frequency) to generate features for text

## Import data

In [None]:
import pandas as pd

In [None]:
# Read the news dataset, keeping only the CSV columns at positions 1-4
# (the column at position 0 is skipped).
article_df = pd.read_csv(
    'data_news.csv',
    usecols=range(1, 5),
)
article_df

## Generate feature words using tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Pull the raw article bodies out of the frame; this Series is the training corpus.
article_text_list = article_df.loc[:, "text"]

# Learn the vocabulary and IDF weights from the corpus (English stop words are
# dropped) and vectorise the same articles in a single pass.
tfidf = TfidfVectorizer(
    stop_words='english',
)
x_train = tfidf.fit_transform(article_text_list)

In [None]:
# Print the 20 vocabulary terms with the highest IDF weight.
# NOTE: a high IDF means the term occurs in very few documents, so these are the
# *rarest* terms in the corpus rather than the most important ones overall.
indices = np.argsort(tfidf.idf_)[::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# list(...) keeps feature_names a plain list of str, as the old API returned.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

top_n = 20
top_feature_names = [feature_names[i] for i in indices[:top_n]]
print(top_feature_names)

In [None]:
# Densify the sparse TF-IDF matrix and flip it so that every row is a vocabulary
# term (labelled through the index) and every column is one training article.
x_train_df = pd.DataFrame(
    data=x_train.toarray().T,
    index=feature_names,
)
x_train_df.tail()

## Test new data using existing features

In [None]:
# new data
import newspaper

# Crawl the CNN politics section. memoize_articles=False turns off newspaper's
# cache of previously seen URLs, so repeated runs still return articles.
cnn_politics_articles = newspaper.build('http://www.cnn.com/politics',memoize_articles=False)

# The number of articles discovered depends on the live site, so a fixed index
# can run past the end of the list. Clamp the preferred index to the last
# available article and fail with a clear message when nothing was found.
article_index = 100
found_articles = cnn_politics_articles.articles
if not found_articles:
    raise RuntimeError("newspaper found no articles at http://www.cnn.com/politics")
new_article = found_articles[min(article_index, len(found_articles) - 1)]

# download() fetches the HTML and parse() extracts title/text from it.
new_article.download()
new_article.parse()
print(new_article.title)
new_text = new_article.text

In [None]:
# Project the new article onto the vocabulary learned from the training corpus.
# transform() reuses the fitted terms and IDF weights, so words that were not
# seen during fitting are ignored.
x_test = tfidf.transform([new_text])

# One row per vocabulary term, one column per test document (a single one here).
# Build the frame from x_test -- the original mistakenly re-used x_train, which
# displayed the training matrix under the test name.
x_test_df = pd.DataFrame(x_test.toarray().transpose(), index = feature_names)
x_test_df.tail()


In [None]:
# nonzero() returns (row_indices, column_indices); only the column indices are
# needed because x_test holds a single document (row 0).
active_columns = x_test.nonzero()[1]

# Show each vocabulary term present in the new article with its TF-IDF weight.
for col in active_columns:
    print("%s - %s" %(feature_names[col], x_test[0, col]))