### HW4
Use the News API to execute a query on one or more keywords of your choice. Create a K-Means clustering model based on the content of each article. Use TF-IDF to process the text, and then Principal Component Analysis (PCA) to reduce the dimensionality of the data, before you do any K-Means fitting or plotting.

In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer


# URL of the API endpoint; passed to get_response() as the address of the GET request.
API_ENDPOINT = "https://newsapi.org/v2/everything/"

In [None]:
def get_response(ENDPOINT, query='+United States employment', api_key="", timeout=10):
    '''Send a GET request to an API endpoint and return the decoded JSON body.

    Args:
        ENDPOINT: URL the request is sent to.
        query: Search expression sent as the ``q`` parameter. Defaults to the
            query this notebook was written for.
        api_key: Key sent as the ``apiKey`` parameter. Defaults to an empty
            string; pass your own key instead of committing it to the file.
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    '''
    params = {
        "apiKey": api_key,
        "q": query,
    }
    response = requests.get(ENDPOINT, params=params, timeout=timeout)
    # NOTE: the printed URL carries the full query string, apiKey included.
    print(response.url)
    # Fail here with a clear HTTP error instead of returning an error payload
    # that only breaks later when the article list is looked up.
    response.raise_for_status()
    return response.json()
# Run the query once; `data` holds the decoded JSON response that the following cells read.
data = get_response(API_ENDPOINT)

In [None]:
# Save the raw API response to data.json (requires the `json` module to be imported).
with open('data.json', 'w', encoding='utf-8') as file:
    json.dump(data, file)

In [None]:
# Unpack the JSON object and collect the article bodies that form the corpus.
# An article can carry the "content" key with a null (None) or empty value;
# those are skipped because the vectorizer needs real text for every document.
text = [
    article["content"]
    for article in data["articles"]
    if article.get("content")
]

In [None]:
# Use TfidfVectorizer to weight every word by the product of its term frequency
# and inverse document frequency, ignoring common English stop words.
vectorizer = TfidfVectorizer(stop_words = "english")
# Learn the vocabulary from the corpus and transform it in the same call.
vectorized_data = vectorizer.fit_transform(text)
vectorized_data

In [None]:
# Preview the TF-IDF weights of the first five articles, one column per vocabulary term.
# Only the previewed rows are converted to a dense array, not the whole matrix.
pd.DataFrame(
    vectorized_data[:5].toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# Perform Principal Component Analysis to reduce the TF-IDF matrix to two dimensions
pca = PCA(n_components=2)
# The TF-IDF matrix is converted to a dense array before being handed to PCA.
components = pca.fit_transform(vectorized_data.toarray())
# Bare expression: in a notebook only a cell's last expression is echoed, so this line shows nothing.
components.shape
components

In [None]:
#Custom function for finding the best n in reducing model inertia
def fit_KMeans(start_n, end_n, data=None, random_state=None):
    '''Fit one KMeans model per cluster count and record its inertia.

    Args:
        start_n: Smallest number of clusters to try.
        end_n: Largest number of clusters to try (inclusive).
        data: Samples to cluster. When omitted, the notebook-level
            ``components`` array is used, which keeps existing calls working.
        random_state: Forwarded to ``KMeans``; pass an integer to make the
            sweep reproducible. The default leaves seeding to scikit-learn.

    Returns:
        A list of ``[n, inertia]`` pairs ordered by increasing ``n``.
    '''
    if data is None:
        data = components
    # KMeans.fit returns the fitted estimator, so inertia_ can be read directly.
    return [
        [n_clusters,
         KMeans(n_clusters=n_clusters, random_state=random_state).fit(data).inertia_]
        for n_clusters in range(start_n, end_n + 1)
    ]

# Try every cluster count from 2 to 20 and tabulate the inertia of each fit.
scores = fit_KMeans(2, 20)
df_scores = pd.DataFrame(scores, columns=["n", "Inertia"])
df_scores

In [None]:
# Elbow plot: inertia against the number of clusters, for choosing a cluster count visually
sns.lineplot(data=df_scores, x="n", y="Inertia")
plt.xlabel("Number of clusters")
plt.title("Number of Clusters Against Inertia")
plt.savefig("Number of Clusters Against Inertia.png")

In [None]:
kmeans_2 = KMeans(n_clusters=4) #chosen as the best number of clusters
result = kmeans_2.fit(components)
# Label every point in a single call instead of one predict() per row. The loop
# variable is no longer called `data`, so the API response held in `data` survives.
labels = kmeans_2.predict(components)
# Each point keeps its single-row 2-D shape (reshape(1, -1)) so it can be indexed
# as point[0, col]; each label is stored as a plain int so the column stays
# hashable and can be counted or grouped.
predictions = [
    (point.reshape(1, -1), int(label))
    for point, label in zip(components, labels)
]
df_pred = pd.DataFrame(data=predictions, columns=["Data Point", "Cluster Label"])
#color code cluster label for visualization; any label other than 0-2 is drawn in gold
cluster_colors = {0: 'red', 1: 'blue', 2: 'green'}
df_pred['Color'] = df_pred['Cluster Label'].map(
    lambda label: cluster_colors.get(label, 'gold')
)
df_pred

In [None]:
# Split every stored data point (a single-row 2-D array) into separate x and y columns
points = df_pred["Data Point"]
df_pred['x'] = points.apply(lambda pt: pt[0, 0])
df_pred['y'] = points.apply(lambda pt: pt[0, 1])
df_pred

In [None]:
#plot scatterplot with color indicating their assigned cluster
plt.scatter(x=df_pred['x'], y=df_pred['y'], c=df_pred["Color"])
plt.title("Values of PCA Color-Coded to Indicate Cluster")
plt.xlabel("Most significant loading")
plt.ylabel("Second most significant loading")
# Save only after the axis labels are set, otherwise the PNG is written without them.
plt.savefig("color_coded_scatter.png")
plt.show()


In [None]:
#How many data points are in each cluster
# Labels may be stored as one-element arrays (what predict() returns for a single
# sample); arrays are unhashable, so reduce each label to a plain int before counting.
df_pred["Cluster Label"].map(lambda label: int(np.ravel(label)[0])).value_counts()