In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    text = text.lower()  # convert to lowercase
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    tokens = word_tokenize(text)  # tokenize
    tokens = [token for token in tokens if token not in stop_words]  # remove stopwords
    tokens = [stemmer.stem(token) for token in tokens]  # stem
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to /Users/jinho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jinho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load your data
df = pd.read_csv('../news_data/Tesla/tesla_news_raw.csv')

# Define a TF-IDF Vectorizer Object. Remove all english stop words
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string
df = df.fillna('')

# Perform the necessary preprocessing steps for the 'description' column
df['description'] = df['description'].apply(lambda x: preprocess(x))

# Construct the required TF-IDF matrix by applying the fit_transform method on the 'description' column
tfidf_matrix = tfidf.fit_transform(df['description'])

#Output the shape of tfidf_matrix
print(tfidf_matrix.shape)

# To get the feature names (here, the words)
feature_names = tfidf.get_feature_names_out()

# Assuming you want top 10 keywords per document
top_n = 10
keywords_list = []

# iterate through each document
for doc in range(tfidf_matrix.shape[0]):
    feature_index = tfidf_matrix[doc,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])
    words_scores = [(feature_names[i], s) for (i, s) in tfidf_scores]
    sorted_words_scores = sorted(words_scores, key=lambda x: x[1], reverse=True)
    keywords = sorted_words_scores[:top_n]
    keywords_list.append(keywords)

df['keywords'] = keywords_list


(41604, 2266)


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Let's say we have some texts
texts = ["model_3 is selling well"]

# Initialize the vectorizer and fit it with data
vectorizer = TfidfVectorizer().fit(texts)

# Now we can access the feature names
feature_names = vectorizer.get_feature_names_out()

# This will output: ['document', 'first', 'is', 'second', 'the', 'this']
print(feature_names)


['is' 'model_3' 'selling' 'well']


In [14]:
def is_related(sentence):
    sentence = preprocess(sentence)
    sentence_tokens = word_tokenize(sentence)

    for word in sentence_tokens:
        if word in feature_names:
            return True

    return False

In [16]:
import numpy as np

# Let's get the top 10 keywords
top_n = 10

# Sum the TF-IDF scores for each term through all documents
sum_tfidf = np.sum(tfidf_matrix, axis=0)

# Get tuples of (score, word) and sort by the score in descending order
words_scores = [(score, word) for word, score in zip(feature_names, np.ravel(sum_tfidf))]
sorted_words_scores = sorted(words_scores, key=lambda x: x[0], reverse=True)

# Get the top n words
top_keywords = sorted_words_scores[:top_n]

for score, word in top_keywords:
    print(f'Word: {word}, TF-IDF: {score}')


Word: well, TF-IDF: 99.50341683926783
Word: model_3, TF-IDF: 9.910367613139847
Word: is, TF-IDF: 2.9028845001182306
Word: selling, TF-IDF: 2.0128372239843264


In [15]:
print(keywords_list[:10])

[['indian', 'soon', 'possibl', 'india', 'modi', 'sector', 'mobil', 'mr', 'commerci', 'musk'], ['india', 'modi', 'popul', 'invest', 'meet', 'promot', 'indian', 'offic', 'chief', 'narendra'], ['nomenclatur', 'nicknam', 'nag', 'select', 'configur', 'research', 'fsd', 'secret', 'wheel', 'intern'], ['kia', 'ev5', '2027', 'autocar', 'hilbert', 'europ', 'gone', '15', 'david', 'introduc'], ['cramer', 'believ', 'mutual', 'doesnt', 'bet', 'ford', 'exclus', 'think', 'buy', 'valu'], ['rivian', 'inch', 'join', 'network', 'closer', 'becom', 'charg', 'follow', 'say', 'maker'], ['handsfre', 'hacker', 'discov', 'greentheonli', 'aptli', 'supersecret', 'onlin', 'mode', 'elon', 'hidden'], ['india', 'modi', 'starlink', 'bring', 'distant', 'say', 'musk', 'look', 'humanli', 'someth'], ['microsoft', 'club', 'trilliondollar', 'alphabet', 'aramco', 'saudi', 'trillion', '123', 'amazon', 'mileston'], ['india', 'invest', 'look', 'humanli', 'soon', 'possibl', 'chief', 'modi', 'elon', 'musk']]
