## First steps

In [1]:
import os
import glob

import pandas as pd
import numpy as np
import re

from textblob import TextBlob, Word
from langdetect import detect

In [2]:
# Get CSV files list from a folder
path = os.getcwd()
csv_files = glob.glob(path + "/*.csv")

# Read each CSV file into DataFrame
# This creates a list of dataframes
df_list = (pd.read_csv(file, sep='\t') for file in csv_files)

# Concatenate all DataFrames
df = pd.concat(df_list, ignore_index=True)

### Rename some columns

In [3]:
df.columns = ['date', 'title', 'subtitle', 'claps', 'responses', 'author_handle',
       'story_url', 'reading_time', 'text', 'author_followers',
       'publication']

------

------

In [6]:
df = pd.read_csv('raw_full_dataset.csv')

### Dropping rows

- There are some duplicate values in the dataset from articles that where posted multiple times and so these will be dropped.
- Similarly articles without a story_url won't have text.
- Finally I will drop author_followers with null values

In [7]:
df.drop_duplicates(subset=['story_url'], inplace=True)
df.drop_duplicates(subset=['title'], inplace=True)
df = df[df.story_url != '-']
df = df[df.author_followers != '-']

df.reset_index(inplace=True, drop=True)

In [8]:
df.head()

Unnamed: 0,date,title,subtitle,claps,responses,author_handle,story_url,reading_time,text,author_followers,publication
0,01/01/2020,Making Python Programs Blazingly Fast,Let’s look at the performance of our Python pr...,3.4K,3,https://towardsdatascience.com/@martin.heinz,https://towardsdatascience.com/making-python-p...,5,Making Python Programs Blazingly Fast\n\nLet’s...,3.8K,
1,01/01/2020,Implementing a fully convolutional network (FC...,"A tutorial on building, training and…",543,4,https://towardsdatascience.com/@himanshurawlani,https://towardsdatascience.com/implementing-a-...,11,Understanding and implementing a fully convolu...,331,
2,01/01/2020,6 New Features in Python 3.8 for Python Newbies,Python Beginner,1.8K,4,https://towardsdatascience.com/@edenau,https://towardsdatascience.com/6-new-features-...,4,Python Beginner 6 New Features in Python 3.8 f...,1.7K,
3,01/01/2020,How to be fancy with Python,Python tricks that will make your life easier,1.7K,12,https://towardsdatascience.com/@dipam44,https://towardsdatascience.com/how-to-be-fancy...,5,How to be fancy with Python\n\nPython is cool....,625,
4,01/01/2020,From scratch to search: playing with your data...,One Pipeline to rule…,239,1,https://towardsdatascience.com/@stanislavprihoda,https://towardsdatascience.com/from-scratch-to...,9,From scratch to search: playing with your data...,88,


-------

### Adding the towards data science publications for all Null values as I still hadn't incorporated it to my scraper then.

In [9]:
df['publication'].fillna('towardsdatascience', inplace=True)

I also want to add the amount of followers each publication has, which my scraper didn't include, however since I will be dropping some publications entirely, I will do that further down the line.

---------

## Transforming claps and followers column to integers

Cleaning author column first

In [10]:
df[df.author_followers.str.contains('Followers')]['author_followers'].head()

10593     218 Followers
10594     135 Followers
10595     677 Followers
10596     945 Followers
10597    1.1K Followers
Name: author_followers, dtype: object

In [11]:
df.author_followers = df.author_followers.apply(lambda x: x.replace(' Followers', ''))      

In [12]:
def clean_integers(x):
    try:
        return float(x)
    
    except:
        x = float(x.replace('K', ''))
        
        return x*1000

In [13]:
df.claps = df.claps.apply(clean_integers)
df.author_followers = df.author_followers.apply(clean_integers)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76364 entries, 0 to 76363
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              76364 non-null  object 
 1   title             76363 non-null  object 
 2   subtitle          76364 non-null  object 
 3   claps             76364 non-null  float64
 4   responses         76364 non-null  int64  
 5   author_handle     76364 non-null  object 
 6   story_url         76364 non-null  object 
 7   reading_time      76364 non-null  int64  
 8   text              76361 non-null  object 
 9   author_followers  76364 non-null  float64
 10  publication       76364 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 6.4+ MB


-----

## Extracting author handle from author URL

In [15]:
df.author_handle[1]

'https://towardsdatascience.com/@himanshurawlani'

In [16]:
def extract_author(x):
    pattern = re.compile(r'(@.+)')
    result = re.findall(pattern, str(x))
    str_result = ''.join(result)
    return str_result

In [17]:
df.author_handle = df.author_handle.apply(extract_author)

In [18]:
df.head()

Unnamed: 0,date,title,subtitle,claps,responses,author_handle,story_url,reading_time,text,author_followers,publication
0,01/01/2020,Making Python Programs Blazingly Fast,Let’s look at the performance of our Python pr...,3400.0,3,@martin.heinz,https://towardsdatascience.com/making-python-p...,5,Making Python Programs Blazingly Fast\n\nLet’s...,3800.0,towardsdatascience
1,01/01/2020,Implementing a fully convolutional network (FC...,"A tutorial on building, training and…",543.0,4,@himanshurawlani,https://towardsdatascience.com/implementing-a-...,11,Understanding and implementing a fully convolu...,331.0,towardsdatascience
2,01/01/2020,6 New Features in Python 3.8 for Python Newbies,Python Beginner,1800.0,4,@edenau,https://towardsdatascience.com/6-new-features-...,4,Python Beginner 6 New Features in Python 3.8 f...,1700.0,towardsdatascience
3,01/01/2020,How to be fancy with Python,Python tricks that will make your life easier,1700.0,12,@dipam44,https://towardsdatascience.com/how-to-be-fancy...,5,How to be fancy with Python\n\nPython is cool....,625.0,towardsdatascience
4,01/01/2020,From scratch to search: playing with your data...,One Pipeline to rule…,239.0,1,@stanislavprihoda,https://towardsdatascience.com/from-scratch-to...,9,From scratch to search: playing with your data...,88.0,towardsdatascience


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76364 entries, 0 to 76363
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              76364 non-null  object 
 1   title             76363 non-null  object 
 2   subtitle          76364 non-null  object 
 3   claps             76364 non-null  float64
 4   responses         76364 non-null  int64  
 5   author_handle     76364 non-null  object 
 6   story_url         76364 non-null  object 
 7   reading_time      76364 non-null  int64  
 8   text              76361 non-null  object 
 9   author_followers  76364 non-null  float64
 10  publication       76364 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 6.4+ MB


-------

## Creating length of text column to filter out articles that weren't properly scraped due to the paywall

In [20]:
df['number_of_words'] = df.text.apply(lambda x: len(str(x).split()))
df['number_of_characters'] = df.text.apply(lambda x: len(str(x)))

In [21]:
df = df[df.number_of_words > 200]
df = df[~df.title.isna()]
df.reset_index(inplace=True, drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69200 entries, 0 to 69199
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  69200 non-null  object 
 1   title                 69200 non-null  object 
 2   subtitle              69200 non-null  object 
 3   claps                 69200 non-null  float64
 4   responses             69200 non-null  int64  
 5   author_handle         69200 non-null  object 
 6   story_url             69200 non-null  object 
 7   reading_time          69200 non-null  int64  
 8   text                  69200 non-null  object 
 9   author_followers      69200 non-null  float64
 10  publication           69200 non-null  object 
 11  number_of_words       69200 non-null  int64  
 12  number_of_characters  69200 non-null  int64  
dtypes: float64(2), int64(4), object(7)
memory usage: 6.9+ MB


-------

## Creating datetime object and extracting day of week and month

In [22]:
df['date_time'] = pd.to_datetime(df.date)

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl

In [23]:
df['day_of_the_week'] = df.date_time.apply(lambda x: x.weekday())
df['month'] = df.date_time.apply(lambda x: x.month)

Removing new datetime column as I don't need it anymore

In [24]:
df.day_of_the_week.value_counts()

0    11280
1    10906
2    10501
3    10374
4     9529
6     8556
5     8054
Name: day_of_the_week, dtype: int64

In [25]:
df.drop('date_time', axis=1, inplace=True)

-------

## English language detection
- To be completely certain that all texts are in english

In [29]:
language = []
for text in df['text']:
    language.append(detect(text))

In [36]:
for lang in language:
    if lang != 'en':
        print(lang)

----

## Cleaning text Column

### Lower casing text

In [26]:
df.text

0        Making Python Programs Blazingly Fast\n\nLet’s...
1        Understanding and implementing a fully convolu...
2        Python Beginner 6 New Features in Python 3.8 f...
3        How to be fancy with Python\n\nPython is cool....
4        From scratch to search: playing with your data...
                               ...                        
69195    The way ahead\n\nWherever I go these days, at ...
69196    The most beautiful theory of all\n\nA century ...
69197    Why people get happier as they get older\n\nAs...
69198    Is The Economist left- or right-wing?\n\nNeith...
69199    How the internet unleashed a burst of cartooni...
Name: text, Length: 69200, dtype: object

In [27]:
df['text'] = df['text'].apply(lambda x: " ".join(word.lower() for word in str(x).split()))

In [28]:
df['text']

0        making python programs blazingly fast let’s lo...
1        understanding and implementing a fully convolu...
2        python beginner 6 new features in python 3.8 f...
3        how to be fancy with python python is cool. re...
4        from scratch to search: playing with your data...
                               ...                        
69195    the way ahead wherever i go these days, at hom...
69196    the most beautiful theory of all a century ago...
69197    why people get happier as they get older as pe...
69198    is the economist left- or right-wing? neither....
69199    how the internet unleashed a burst of cartooni...
Name: text, Length: 69200, dtype: object

-------

### Remove stop words - Before and after removing punctuation as some stop words have punctuation

In [29]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [30]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [31]:
df['text'] = df['text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

In [32]:
df['text']

0        making python programs blazingly fast let’s lo...
1        understanding implementing fully convolutional...
2        python beginner 6 new features python 3.8 pyth...
3        fancy python python cool. really cool. however...
4        scratch search: playing data (elasticsearch in...
                               ...                        
69195    way ahead wherever go days, home abroad, peopl...
69196    beautiful theory century ago albert einstein c...
69197    people get happier get older people age, gain ...
69198    economist left- right-wing? neither. consider ...
69199    internet unleashed burst cartooning creativity...
Name: text, Length: 69200, dtype: object

-------

### Remove punctuation and numbers

In [33]:
df['text'] = df['text'].str.replace('[^a-z ]', '')
df['text'] = df['text'].str.replace(r"\s\s+",' ') # this replaces two or more white spcaes with just one

  df['text'] = df['text'].str.replace('[^a-z ]', '')
  df['text'] = df['text'].str.replace(r"\s\s+",' ') # this replaces two or more white spcaes with just one


In [34]:
df['text']

0        making python programs blazingly fast lets loo...
1        understanding implementing fully convolutional...
2        python beginner new features python python new...
3        fancy python python cool really cool however l...
4        scratch search playing data elasticsearch inge...
                               ...                        
69195    way ahead wherever go days home abroad people ...
69196    beautiful theory century ago albert einstein c...
69197    people get happier get older people age gain s...
69198    economist left rightwing neither consider radi...
69199    internet unleashed burst cartooning creativity...
Name: text, Length: 69200, dtype: object

-------

### Remove stop words

In [35]:
df['text'] = df['text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))

In [36]:
len(pd.Series(" ".join(df['text']).split()).value_counts())

745284

In [37]:
len(pd.Series(" ".join(df['text']).split()))

48860083

In [38]:
pd.Series(" ".join(df['text']).split()).value_counts()[:40]

data         473027
one          242467
time         209104
use          197272
like         192220
model        184427
also         167276
get          154375
using        154328
need         151935
make         143100
people       139342
would        138726
new          138348
first        136429
work         128953
learning     121763
want         119998
see          118537
way          114720
used         113129
image        108839
many         107657
different    105770
dont         103775
example      101914
us            99217
even          98522
code          97534
well          97437
two           97154
know          94977
could         93368
value         92355
function      91753
create        91283
set           86751
good          86003
number        84120
may           82973
dtype: int64

-------

### Lemmatization

In [39]:
import nltk
from textblob import Word
#nltk.download()

In [40]:
df['text'] = df['text'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

In [41]:
df['text']

0        making python program blazingly fast let look ...
1        understanding implementing fully convolutional...
2        python beginner new feature python python newb...
3        fancy python python cool really cool however l...
4        scratch search playing data elasticsearch inge...
                               ...                        
69195    way ahead wherever go day home abroad people a...
69196    beautiful theory century ago albert einstein c...
69197    people get happier get older people age gain s...
69198    economist left rightwing neither consider radi...
69199    internet unleashed burst cartooning creativity...
Name: text, Length: 69200, dtype: object

In [42]:
len(pd.Series(" ".join(df['text']).split()).value_counts())

729122

------

### Sentiment Analyser

In [43]:
sentiment = df['text'].apply(lambda x: TextBlob(x).sentiment)
sentiment

0        (0.04971340388007055, 0.40773074661963543)
1        (0.11043365549179505, 0.39827091513138024)
2        (0.08111111111111112, 0.47074074074074074)
3         (0.20963642213642217, 0.4807861432861434)
4         (0.09084165122626665, 0.4359698800724437)
                            ...                    
69195    (0.08379030993161427, 0.38577157444005294)
69196      (0.1279141458955123, 0.4687573718551984)
69197    (0.06104942725313093, 0.41337938467568097)
69198     (0.11327855817651733, 0.3206224048060783)
69199    (0.11283705183705188, 0.46867395852011223)
Name: text, Length: 69200, dtype: object

In [44]:
polarity = []
subjectivity = []

for row in sentiment:
    polarity.append(row[0])
    subjectivity.append(row[1])

df['polarity'] = polarity
df['subjectivity'] = subjectivity

## Now I will do the same for the title and subtitle columns

In [1]:
def clean_text(df, column_name):
    # lower case
    df[column_name] = df[column_name].apply(lambda x: " ".join(word.lower() for word in str(x).split()))
    
    # remove punctuation
    df[column_name] = df[column_name].str.replace('[^a-z ]', '')
    
    # this replaces two or more white spcaes with just one
    df[column_name] = df[column_name].str.replace(r"\s\s+",' ') 
    
    # remove stop words
    df[column_name] = df[column_name].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
    
    # lemmatize
    df[column_name] = df[column_name].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))
    
    return df

In [45]:
# lower case
df['title'] = df['title'].apply(lambda x: " ".join(word.lower() for word in str(x).split()))
# remove punctuation
df['title'] = df['title'].str.replace('[^a-z ]', '')
df['title'] = df['title'].str.replace(r"\s\s+",' ') # this replaces two or more white spcaes with just one
# remove stop words
df['title'] = df['title'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
# lemmatize
df['title'] = df['title'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

  df['title'] = df['title'].str.replace('[^a-z ]', '')
  df['title'] = df['title'].str.replace(r"\s\s+",' ') # this replaces two or more white spcaes with just one


In [46]:
# lower case
df['subtitle'] = df['subtitle'].apply(lambda x: " ".join(word.lower() for word in str(x).split()))
# remove punctuation
df['subtitle'] = df['subtitle'].str.replace('[^a-z ]', '')
df['subtitle'] = df['subtitle'].str.replace(r"\s\s+",' ') # this replaces two or more white spcaes with just one
# remove stop words
df['subtitle'] = df['subtitle'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
# lemmatize
df['subtitle'] = df['subtitle'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))

  df['subtitle'] = df['subtitle'].str.replace('[^a-z ]', '')
  df['subtitle'] = df['subtitle'].str.replace(r"\s\s+",' ') # this replaces two or more white spcaes with just one


In [47]:
df.head()

Unnamed: 0,date,title,subtitle,claps,responses,author_handle,story_url,reading_time,text,author_followers,publication,number_of_words,number_of_characters,day_of_the_week,month,polarity,subjectivity
0,01/01/2020,making python program blazingly fast,let look performance python program see,3400.0,3,@martin.heinz,https://towardsdatascience.com/making-python-p...,5,making python program blazingly fast let look ...,3800.0,towardsdatascience,1056,6175,2,1,0.049713,0.407731
1,01/01/2020,implementing fully convolutional network fcn t...,tutorial building training,543.0,4,@himanshurawlani,https://towardsdatascience.com/implementing-a-...,11,understanding implementing fully convolutional...,331.0,towardsdatascience,2527,15729,2,1,0.110434,0.398271
2,01/01/2020,new feature python python newbie,python beginner,1800.0,4,@edenau,https://towardsdatascience.com/6-new-features-...,4,python beginner new feature python python newb...,1700.0,towardsdatascience,712,4174,2,1,0.081111,0.470741
3,01/01/2020,fancy python,python trick make life easier,1700.0,12,@dipam44,https://towardsdatascience.com/how-to-be-fancy...,5,fancy python python cool really cool however l...,625.0,towardsdatascience,607,3328,2,1,0.209636,0.480786
4,01/01/2020,scratch search playing data elasticsearch inge...,one pipeline rule,239.0,1,@stanislavprihoda,https://towardsdatascience.com/from-scratch-to...,9,scratch search playing data elasticsearch inge...,88.0,towardsdatascience,2158,14087,2,1,0.090842,0.43597


-------

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69200 entries, 0 to 69199
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  69200 non-null  object 
 1   title                 69200 non-null  object 
 2   subtitle              69200 non-null  object 
 3   claps                 69200 non-null  float64
 4   responses             69200 non-null  int64  
 5   author_handle         69200 non-null  object 
 6   story_url             69200 non-null  object 
 7   reading_time          69200 non-null  int64  
 8   text                  69200 non-null  object 
 9   author_followers      69200 non-null  float64
 10  publication           69200 non-null  object 
 11  number_of_words       69200 non-null  int64  
 12  number_of_characters  69200 non-null  int64  
 13  day_of_the_week       69200 non-null  int64  
 14  month                 69200 non-null  int64  
 15  polarity           

Even though it says there are no nulls there are some columns with empty spaces

In [49]:
df = df[df.title != '']

In [50]:
len(df)

69188

In [51]:
df.subtitle = df.subtitle.apply(lambda x: '-' if x == '' else x)

-----

## Creating new target columns

### Creating new column of claps per quartile

In [52]:
def claps_per_quartile(x):
   
    low_q = df.claps.quantile(0.25)
    median = df.claps.quantile(0.5)
    high_q = df.claps.quantile(0.75)

    if x < low_q:
        return 0
    if low_q <= x < median:
        return 1
    if median <= x < high_q:
        return 2
    if high_q <= x:
        return 3

In [53]:
df['claps_per_quartile'] = df.claps.apply(claps_per_quartile)

### Creating new Binary column of Claps by Median

In [54]:
def claps_binary(x):

    median = df.claps.quantile(0.5)
    
    if x < median:
        return 0
    else:
        return 1

In [55]:
df['claps_binary'] = df.claps.apply(claps_binary)

In [56]:
df

Unnamed: 0,date,title,subtitle,claps,responses,author_handle,story_url,reading_time,text,author_followers,publication,number_of_words,number_of_characters,day_of_the_week,month,polarity,subjectivity,claps_per_quartile,claps_binary
0,01/01/2020,making python program blazingly fast,let look performance python program see,3400.0,3,@martin.heinz,https://towardsdatascience.com/making-python-p...,5,making python program blazingly fast let look ...,3800.0,towardsdatascience,1056,6175,2,1,0.049713,0.407731,3,1
1,01/01/2020,implementing fully convolutional network fcn t...,tutorial building training,543.0,4,@himanshurawlani,https://towardsdatascience.com/implementing-a-...,11,understanding implementing fully convolutional...,331.0,towardsdatascience,2527,15729,2,1,0.110434,0.398271,3,1
2,01/01/2020,new feature python python newbie,python beginner,1800.0,4,@edenau,https://towardsdatascience.com/6-new-features-...,4,python beginner new feature python python newb...,1700.0,towardsdatascience,712,4174,2,1,0.081111,0.470741,3,1
3,01/01/2020,fancy python,python trick make life easier,1700.0,12,@dipam44,https://towardsdatascience.com/how-to-be-fancy...,5,fancy python python cool really cool however l...,625.0,towardsdatascience,607,3328,2,1,0.209636,0.480786,3,1
4,01/01/2020,scratch search playing data elasticsearch inge...,one pipeline rule,239.0,1,@stanislavprihoda,https://towardsdatascience.com/from-scratch-to...,9,scratch search playing data elasticsearch inge...,88.0,towardsdatascience,2158,14087,2,1,0.090842,0.435970,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69195,01/01/2021,way ahead,-,2500.0,99,@potus44,https://medium.economist.com/the-way-ahead-65b...,13,way ahead wherever go day home abroad people a...,3800.0,uxplanet,2955,18433,4,1,0.083790,0.385772,3,1
69196,01/01/2021,beautiful theory,century ago albert einstein changed way human saw,4600.0,21,@the_economist,https://medium.economist.com/the-most-beautifu...,16,beautiful theory century ago albert einstein c...,334000.0,uxplanet,3953,24166,4,1,0.127914,0.468757,3,1
69197,01/01/2021,people get happier get older,people age gain spend life,316.0,11,@the_economist,https://medium.economist.com/why-people-get-ha...,11,people get happier get older people age gain s...,334000.0,uxplanet,2410,14495,4,1,0.061049,0.413379,3,1
69198,01/01/2021,economist left rightwing,neither consider radical centre,551.0,39,@the_economist,https://medium.economist.com/is-the-economist-...,3,economist left rightwing neither consider radi...,334000.0,uxplanet,572,3599,4,1,0.113279,0.320622,3,1


----

--------

## Saving to CSV

In [57]:
df.to_csv('df_final_no_numbers.csv', index=False)