In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('tokenized.csv')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
import numpy as np

# Create a TfidfVectorizer object for the article_tokens column
tfidf_article = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
tfidf_article.fit(data['article_tokens'])

# Fit and transform the article_tokens column
tfidf_article_tokens = tfidf_article.transform(data['article_tokens'])




# Create a TfidfVectorizer object for the highlights_tokens column
tfidf_highlights = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
tfidf_highlights.fit(data['highlights_tokens'])

# Fit and transform the highlights_tokens column
tfidf_highlights_tokens = tfidf_highlights.transform(data['highlights_tokens'])






In [5]:
print(tfidf_article_tokens)

  (0, 73)	0.0022646937072834574
  (0, 72)	0.02234304627210692
  (0, 71)	0.007400751739817798
  (0, 70)	0.01546816618579909
  (0, 69)	0.006875871404560421
  (0, 68)	0.034373559498124893
  (0, 67)	0.15296138433225956
  (0, 66)	0.11686937229880506
  (0, 65)	0.08421469474472718
  (0, 63)	0.05156098336629842
  (0, 62)	0.1615547205307011
  (0, 61)	0.10312003438129859
  (0, 60)	0.041248013752519436
  (0, 59)	0.0532786844303376
  (0, 58)	0.013757126710297067
  (0, 57)	0.0071799881584349405
  (0, 56)	0.11858803953849337
  (0, 55)	0.09108974301411923
  (0, 54)	0.024061441550899423
  (0, 53)	0.05156044659981371
  (0, 52)	0.18217872740696084
  (0, 51)	0.08421539611302906
  (0, 50)	0.06015360387724856
  (0, 49)	0.029219168113049143
  (0, 48)	0.13749337917506477
  :	:
  (480288, 55)	0.07980234125044912
  (480288, 54)	0.032876316641910375
  (480288, 53)	0.02866152360258054
  (480288, 52)	0.18011650609831645
  (480288, 51)	0.07109181834757722
  (480288, 50)	0.03371929911990808
  (480288, 49)	0.0289440

In [6]:
batch_size = 500

# Standardize the article_tokens column
scaler_article = StandardScaler()
scaled_article = []

for i in range(0, len(tfidf_article_tokens.toarray()), batch_size):
    batch = tfidf_article_tokens.toarray()[i:i+batch_size]
    scaled_batch = scaler_article.fit_transform(batch)
    scaled_article.append(scaled_batch)

scaled_article = np.concatenate(scaled_article, axis=0)



In [7]:
# Standardize the highlights_tokens column
scaler_highlights = StandardScaler()
scaled_highlights = []

for i in range(0, len(tfidf_highlights_tokens.toarray()), batch_size):
    batch = tfidf_highlights_tokens.toarray()[i:i+batch_size]
    scaled_batch = scaler_highlights.fit_transform(batch)
    scaled_highlights.append(scaled_batch)

scaled_highlights = np.concatenate(scaled_highlights, axis=0)

In [8]:
print(scaled_article)

[[-1.13807199 -0.28052357  0.         ...  0.          0.
   0.        ]
 [-0.85052805 -0.28052357  0.         ...  0.          0.
   0.        ]
 [ 0.08654211 -0.28052357  0.         ...  0.          0.
   0.        ]
 ...
 [ 2.08390261 -0.24341082 -0.55047126 ...  0.          0.
   0.        ]
 [-0.25012622  0.35562961 -1.17121525 ...  0.          0.
   0.        ]
 [ 0.01723066 -0.24341082 -0.9904161  ...  0.          0.
   0.        ]]


In [9]:
print(scaled_highlights)

[[-0.2646441  -0.04476615  0.         ...  0.          0.
   0.        ]
 [ 0.25980248 -0.04476615  0.         ...  0.          0.
   0.        ]
 [-0.04730716 -0.04476615  0.         ...  0.          0.
   0.        ]
 ...
 [ 0.4201044   0.         -0.72204012 ...  0.          0.
   0.        ]
 [-0.0683246   0.         -1.268546   ...  0.          0.
   0.        ]
 [ 1.15416524  0.         -1.4074981  ...  0.          0.
   0.        ]]


In [10]:
# Perform dimensionality reduction using TruncatedSVD
svd_article = TruncatedSVD(n_components=317, random_state=42)
reduced_article = svd_article.fit_transform(scaled_article)



In [11]:
print(len(reduced_article))

480289


In [12]:
print(reduced_article)

[[-4.16576495e-01  4.95353162e+00  2.71360777e+00 ... -3.46368399e-18
  -4.63624262e-19 -9.29189588e-16]
 [-3.65752862e+00  2.90025064e-01  1.16171038e-01 ... -7.82857933e-19
   2.57343350e-19  1.82435651e-16]
 [ 9.43802209e-02 -8.03207253e-01  7.87193813e-01 ... -2.92011896e-19
   3.37394088e-19  4.20398120e-17]
 ...
 [ 3.21083777e+00 -2.71452968e+00  1.22687363e+00 ...  4.91880553e-19
   6.03311568e-21 -5.54193930e-16]
 [-1.14604454e+00 -1.52057313e+00 -1.24886102e+00 ... -1.66470815e-18
   3.06327342e-19 -2.05587108e-16]
 [-1.36363002e-02 -1.70944384e+00 -7.85274348e-01 ... -1.10913340e-19
  -5.84196017e-19  1.19536364e-16]]


In [13]:
# Perform dimensionality reduction using TruncatedSVD
svd_highlights = TruncatedSVD(n_components=50, random_state=42)
reduced_highlights = svd_highlights.fit_transform(scaled_highlights)

In [14]:
print(len(reduced_highlights))

480289


In [15]:
print(reduced_article[600:1000])

[[ 5.50218811e+00  3.53316599e+00 -1.86026073e+00 ... -6.35880826e-19
  -4.58142392e-19  1.14388819e-16]
 [-3.65698475e-01 -1.96884342e+00 -1.38462062e+00 ...  1.81810774e-19
   1.14682633e-19  3.57817706e-16]
 [-7.57376042e+00  3.61295638e+00  9.68678003e-01 ... -2.18031519e-18
  -3.42201359e-19 -6.60081051e-16]
 ...
 [ 2.80401548e+00 -1.14243896e+00  1.00624196e+00 ...  3.99981834e-18
   1.03481763e-18 -1.63050589e-17]
 [ 3.49849732e+00  1.45947725e+00  1.94373410e-01 ...  1.86014662e-18
  -1.74289515e-18  3.61224387e-16]
 [ 3.51244292e-01  2.84661281e+00 -8.68020310e-01 ...  3.22946768e-18
  -2.83815037e-19  4.12094919e-16]]


In [16]:
reconstructed_highlights = svd_highlights.inverse_transform(reduced_highlights)


In [18]:
print('''The Tesla CEO, Elon Musk has fallen to the third spot in the list of 
    the Forbes billionaire rankings after a significant fall in 
    Tesla shares recently. The billionaire lost close to $6.2 billion 
    by the end of the day on Wednesday. 
    Musk's overall wealth has fallen down dramatic since January when 
    the Tesla CEO was valued at around $197 billion and after the 
    consecutive fall within the last few weeks, Musk is now valued at 
    $150.900 billion, as per Forbes's real-time billionaire's list.
    While Amazon's CEO Jeff Bezos appears to be unshakable at the number 
    one position at $175.4 billion, the second spot has now been claimed 
    by the French billionaire Bernard Arnault who is the owner conglomerate 
    LVMH , with a fortune of $ 155.600 billion. For the record, the LVMH group 
    encompasses over 70 luxury brands including Fendi, Givenchy, Louis Vuitton, 
    Christian Dior, Marc Jacobs, Fenty Beauty, 
    Bvlgari, Tiffany & Co. and Sephora.''')

The Tesla CEO, Elon Musk has fallen to the third spot in the list of 
    the Forbes billionaire rankings after a significant fall in 
    Tesla shares recently. The billionaire lost close to $6.2 billion 
    by the end of the day on Wednesday. 
    Musk's overall wealth has fallen down dramatic since January when 
    the Tesla CEO was valued at around $197 billion and after the 
    consecutive fall within the last few weeks, Musk is now valued at 
    $150.900 billion, as per Forbes's real-time billionaire's list.
    While Amazon's CEO Jeff Bezos appears to be unshakable at the number 
    one position at $175.4 billion, the second spot has now been claimed 
    by the French billionaire Bernard Arnault who is the owner conglomerate 
    LVMH , with a fortune of $ 155.600 billion. For the record, the LVMH group 
    encompasses over 70 luxury brands including Fendi, Givenchy, Louis Vuitton, 
    Christian Dior, Marc Jacobs, Fenty Beauty, 
    Bvlgari, Tiffany & Co. and Sephora.


In [None]:
print('''The Tesla CEO, Elon Musk has fallen to the third spot in the list
of the Forbes billionaire rankings after a significant fall in Tesla shares 
recently.<n>The billionaire lost close to $6.2 billion by the end of the day on Wednesday.
<n>While Amazon's CEO Jeff Bezos appears to be unshakable at the number one position 
at $175.4 billion, the second spot has now been claimed by the French billionaire Bernard Arnault.''')

In [None]:
print('''"Musk's overall wealth has fallen down dramatic since January when the Tesla CEO was valued
at around $197 billion and after the consecutive fall within the last few weeks, 
Musk is now valued at $150.900 billion, 
as per Forbes's real-time billionaire's list.")