<a href="https://colab.research.google.com/github/C-Crenshaw/Project1_DS4002/blob/main/DS_4002_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring the Relationship Between the Connotation of News Articles and Company Revenue Using Sentiment Analysis

##All Imports

In [None]:
#Standard imports
import pandas as pd
import numpy as np
import random

#Natural Language Toolkit (NLTK) import and pre-trained model and other resources download
import nltk
nltk.download('all')
#45s runtime

#Other NLTK imports
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#ML imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

##Data Cleaning

In [None]:
p = 0.01  # Take 1% of the population to generate a sample
SAMPLE_SEED = 42  # Fixed seed so the same 1% sample is drawn on every run
sampler = random.Random(SAMPLE_SEED)  # Private RNG; leaves the global `random` state untouched

# Row 0 (the header) is always kept; every other row is skipped with probability 1 - p.
# NOTE(review): this path is relative ("content/...") while later cells read from "/content/..." - confirm which is intended.
nyt = pd.read_csv(r"content/nyt-metadata.csv", header=0, skiprows=lambda i: i > 0 and sampler.random() > p, low_memory=False)
nyt = nyt.drop(columns=['web_url', 'uri', '_id', 'byline', 'subsection_name', 'document_type', 'multimedia', 'source', 'snippet', 'keywords'])

nyt.info() # Check to make sure that changes have shown

# Clean headline column: keep only the text found between the 'main' and 'kicker' markers.
# str.index raises ValueError if a marker is missing, so malformed rows fail loudly.
start = "'main': "
end = ", 'kicker': "
# The +1 / -1 offsets drop the quote character on each side of the headline text.
# Built as a plain list in one pass (np.append in a loop re-copies the array on every row).
headline = [
    value[value.index(start) + len(start) + 1: value.index(end) - 1]
    for value in nyt["headline"]
]

# Create new column
nyt['headlines'] = headline

# Drop old headline column
nyt = nyt.drop(columns=['headline'])

# Rearrange column order (personal preference)
new_cols = ['headlines', 'abstract', 'lead_paragraph', 'print_section', 'print_page', 'pub_date', 'news_desk', 'section_name', 'type_of_material', 'word_count']
nyt = nyt.reindex(columns=new_cols)

# Export new dataset
nyt.to_csv('cleaned-nyt-metadata.csv', index=False)

In [None]:
#Load the sample workbook and keep only the first ten columns
#(anything after that is a blank trailing column)
#15s runtime
nyt = pd.read_excel(r"/content/nyt-metadata-SAMPLE.xlsx").iloc[:, :10]

#Force every headline to be a string
nyt = nyt.assign(headlines=nyt['headlines'].astype(str))

In [None]:
#Text preprocessing function
def textPreprocess(string):
  """Lower-case, tokenize, remove English stop words from, and lemmatize a string.

  Parameters:
    string: the text to clean (must support `.lower()`).

  Returns:
    The surviving lemmatized tokens joined back together with single spaces.
  """
  #Tokenizing text
  tokens = word_tokenize(string.lower())

  #Removing stop words
  #Build the stop-word set once per call; the original re-evaluated
  #stopwords.words("english") for every single token
  stopWords = set(stopwords.words("english"))
  filteredTokens = [token for token in tokens if token not in stopWords]

  #Lemmatizing tokens
  lemmatizer = WordNetLemmatizer()
  lemmatizedTokens = [lemmatizer.lemmatize(token) for token in filteredTokens]

  #Joining tokens back together
  processedString = ' '.join(lemmatizedTokens)

  return processedString

In [None]:
#One shared sentiment analyzer, created once and reused by sentimentFetch
analyzer = SentimentIntensityAnalyzer()

#Sentiment fetching function
def sentimentFetch(string):
  """Return the "compound" entry of the analyzer's polarity scores for `string`."""
  return analyzer.polarity_scores(string)["compound"]

In [None]:
#Preprocess each headline and score it in one pass; store the score as a new column
nyt["compound_sentiment"] = nyt["headlines"].map(lambda text: sentimentFetch(textPreprocess(text)))
nyt.head()
#26s runtime

Unnamed: 0,headlines,abstract,lead_paragraph,print_section,print_page,pub_date,news_desk,section_name,type_of_material,word_count,compound_sentiment
0,"On This First Day, a Fanfare for the New Era; ...","Anne Lord Witt letter, replying to Joyce Carol...",To the Editor:,A,30,2000-01-01 05:00:00+00:00,Editorial Desk,Opinion,Letter,129,0.4404
1,Manifestoes To Give City A New Edge,Range of influential experts offer ideas on ki...,For anyone who thinks New York City institutio...,E,28,2000-01-01 05:00:00+00:00,The Millennium,Archives,News,134,0.0
2,"Paid Notice: Deaths CLOSE, MARY ''MOLLY'' G.","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...",1,35,2000-01-02 05:00:00+00:00,Classified,Archives,Paid Death Notice,128,-0.5994
3,Dec. 26 - Jan. 1; China Sentences 4 Members Of...,Sending a harsh message to followers of the Fa...,Sending a harsh message to followers of the Fa...,4,2,2000-01-02 05:00:00+00:00,Week in Review Desk,Week in Review,News,105,0.0772
4,What Do the Stars Say? Boot Up and Find Out,Fortune tellers in New York's ethnic communiti...,"THIS time of year, predictions are as plentifu...",14,3,2000-01-02 05:00:00+00:00,The City Weekly Desk,New York,News,285,0.0


In [None]:
#Map a compound score to a sentiment direction
def compoundToDirection(score):
  """Return 1 if score >= 0.05, -1 if score <= -0.05, and 0 otherwise."""
  #The two comparisons are mutually exclusive, so their difference is 1, -1 or 0
  isPositive = score >= 0.05
  isNegative = score <= -0.05
  return int(isPositive) - int(isNegative)

In [None]:
#Derive the sentiment direction (-1, 0 or 1) from each compound score
nyt["sentiment_direction"] = nyt["compound_sentiment"].map(compoundToDirection)
nyt.head()

Unnamed: 0,headlines,abstract,lead_paragraph,print_section,print_page,pub_date,news_desk,section_name,type_of_material,word_count,compound_sentiment,sentiment_direction
0,"On This First Day, a Fanfare for the New Era; ...","Anne Lord Witt letter, replying to Joyce Carol...",To the Editor:,A,30,2000-01-01 05:00:00+00:00,Editorial Desk,Opinion,Letter,129,0.4404,1
1,Manifestoes To Give City A New Edge,Range of influential experts offer ideas on ki...,For anyone who thinks New York City institutio...,E,28,2000-01-01 05:00:00+00:00,The Millennium,Archives,News,134,0.0,0
2,"Paid Notice: Deaths CLOSE, MARY ''MOLLY'' G.","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...","CLOSE-Mary ''Molly'' G.., 94, of Niantic, CT, ...",1,35,2000-01-02 05:00:00+00:00,Classified,Archives,Paid Death Notice,128,-0.5994,-1
3,Dec. 26 - Jan. 1; China Sentences 4 Members Of...,Sending a harsh message to followers of the Fa...,Sending a harsh message to followers of the Fa...,4,2,2000-01-02 05:00:00+00:00,Week in Review Desk,Week in Review,News,105,0.0772,1
4,What Do the Stars Say? Boot Up and Find Out,Fortune tellers in New York's ethnic communiti...,"THIS time of year, predictions are as plentifu...",14,3,2000-01-02 05:00:00+00:00,The City Weekly Desk,New York,News,285,0.0,0


In [None]:
#Publication year = first four characters of pub_date, stored as a string column
nyt["pub_year"] = nyt["pub_date"].str.slice(stop=4).astype(str)

# Write the sentiment-annotated dataset to disk
nyt.to_csv("nyt-sentiment.csv", index=False)

###Sentiment Dataset Cleaning

In [None]:
#Read in sentiment data
nyt_s = pd.read_csv("/content/nyt-sentiment.csv")

#Convert pub_year column to pandas timestamp
#read_csv may infer pub_year as integers (2000), floats (2000.0) or strings ("2000").
#pd.to_datetime treats bare integers as nanoseconds since the epoch, which would turn
#every year into 1970, so normalise to 4-digit year text and parse it with an explicit format.
year_text = nyt_s['pub_year'].astype(str).str.extract(r'^(\d{4})(?:\.0)?$', expand=False)
nyt_s['pub_year'] = pd.to_datetime(year_text, format='%Y', errors='coerce')
#Drop any NaT values resulting from coerced conversion (values that were not a 4-digit year)
nyt_s = nyt_s.dropna(axis=0, subset=['pub_year'])

###Revenue Dataset Cleaning

In [None]:
#Load the revenue workbook
nyt_r = pd.read_excel("/content/New York Times Revenue.xlsx")
#Replace the Year column with just its year component
nyt_r = nyt_r.assign(Year=nyt_r["Year"].dt.year)


##ML Dataset Creation

In [None]:
#Average compound sentiment and sentiment direction per publication year
sentiment_cols = ["compound_sentiment", "sentiment_direction"]
year_key = nyt_s["pub_year"].dt.year
yearly_means = nyt_s.groupby(year_key)[sentiment_cols].mean()

#Move the year out of the index into its own "Year" column
nyt_ml = yearly_means.reset_index().rename(columns={"pub_year": "Year"})

#Attach adjusted revenue by Year, give it a friendlier name, then drop Year
#since only the sentiment features and the revenue target are kept for ML
revenue_col = "Revenue_Adjusted (in 2022 dollars)"
nyt_ml = (
    nyt_ml
    .merge(nyt_r[["Year", revenue_col]], on="Year")
    .rename(columns={revenue_col: "Adjusted 2022 Revenue in Billions USD"})
    .drop(columns=["Year"])
)

#Min-max scale the two sentiment features
nyt_ml[sentiment_cols] = MinMaxScaler().fit_transform(nyt_ml[sentiment_cols])

##Machine Learning

In [None]:
#Target is the adjusted revenue column; features are every remaining column
y = nyt_ml["Adjusted 2022 Revenue in Billions USD"]
X = nyt_ml.drop(columns=y.name)

In [None]:
#Find averages of MSE and RMSE over N_RUNS independently split-and-fitted models
N_RUNS = 1000
mse_avg = 0
rmse_avg = 0
for i in range(N_RUNS):
  #Split data; seeding each split with the run index gives N_RUNS different
  #splits that are nevertheless identical on every re-run of the notebook
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
  #Create and train model
  mult_reg = LinearRegression()
  mult_reg.fit(X_train, y_train)

  #Generate Predictions
  predicted = mult_reg.predict(X_test)
  actual = np.array(y_test)

  #MSE & RMSE (arguments in scikit-learn's documented order: y_true, y_pred)
  mse = mean_squared_error(actual, predicted)
  #Take the root directly: the squared=False keyword is deprecated and has been
  #removed from recent scikit-learn releases
  rmse = np.sqrt(mse)

  mse_avg += mse
  rmse_avg += rmse

mse_avg /= N_RUNS
rmse_avg /= N_RUNS
print("Average MSE is", round(mse_avg,5), "and Average RMSE is", round(rmse_avg,5))

Average MSE is 1.32165 and Average RMSE is 1.12832
