# DataFrame Stemming

In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv('IBTimes.csv')
df2 = pd.read_csv('Hindu.csv')
df3 = pd.read_csv('IndiaToday.csv')

In [3]:
def remove_redun_rows(df, default_cols, cont_col_subset = ["Category", "Headline", "Summary", "Entire_News", "News_Link"], meta_col_subset = ["Date", "Time"]):
    """It removes faulty or duplicate rows
    If there are more columns in the given dataframe than default, this removes those rows that have such more columns. 
    If there are less columns in the given dataframe than default, it returns "None", thereby marking them unusable. 
    If there are duplicated content in any "cont_col_subset", it drops the extra rows except the latest entry.
    If there are rows with missing values for important columns, it removes such rows. 
    If there are rows with missing values for non-important columns, it ignores them, but informs there existence. 
    It returns the trimmed dataframe as the output in all cases except when the number of columns are less than default,
    and prints any missing values in non-important columns"""
    skip = False
    if list(df.columns) != list(default_cols):
        if len(df.columns) == len(default_cols):
            print("There seems to be some error in columns names")
        elif len(df.columns) < len(default_cols):
            print("The given DataFrame seems to have some missing columns")
            df = None # Marking the df useless
            skip = True # Skipping the DataFrame operations
        elif len(df.columns) > len(default_cols):
            print("The given DataFrame seems to have more columns than required")
            df_xtra_col_idx = df.loc[:, df.columns[len(default_cols):]].isnull().any(axis = 1)
            df = df.loc[df.index[df_xtra_col_idx], default_cols]
    if not skip:
        df = df.drop_duplicates(subset = cont_col_subset)
        df = df.dropna(subset = cont_col_subset+meta_col_subset)
        non_imp_cols = list(set(default_cols)-set(cont_col_subset+meta_col_subset))
        display(df.loc[df.loc[:, non_imp_cols].isnull().any(axis = 1), :])
    return df

In [17]:
frames=[df1, df2, df3]
Scraped_News=pd.concat(frames, ignore_index=True)

In [18]:
display(Scraped_News)

Unnamed: 0.1,Unnamed: 0,Date,Time,Category,Sub-Category,Headline,Entire_News,Author,News_Link
0,0,2022-03-14,16:50:19,society,fact-check,"Fact check: No, Ugandan cop didn't hit journal...",A Ugandan cop is creating a buzz on social me...,\nSami Khan ...,https://www.ibtimes.co.in/fact-check-no-uganda...
1,1,2022-02-21,21:54:03,society,fact-check,Fact check: Murdered Bajrang Dal activist not ...,The brutal murder of Bajrang Dal activist in ...,\nSami Khan ...,https://www.ibtimes.co.in/fact-check-murdered-...
2,2,2022-02-16,01:03:07,society,fact-check,Fact check: Nandan Nilekani isn't launching cr...,Discussion over cryptocurrency took prominenc...,\nSami Khan ...,https://www.ibtimes.co.in/fact-check-nandan-ni...
3,3,2022-02-09,02:12:06,society,fact-check,Fact check: Indian flag not replaced by saffro...,Shocking details are emerging from the widesp...,\nSami Khan ...,https://www.ibtimes.co.in/fact-check-indian-fl...
4,4,2022-02-08,18:20:45,society,fact-check,Fact check: Viral video falsely links J&K poli...,University of Kashmir students are protesting...,\nSami Khan ...,https://www.ibtimes.co.in/fact-check-viral-vid...
...,...,...,...,...,...,...,...,...,...
2377,424,"March 23, 2022",12:24 IST,science,,Ingenuity helicopter aces 22nd flight on Mars ...,The little helicopter that has surprised its m...,India Today Web Desk,https://www.indiatoday.in/science/story/ingenu...
2378,425,"March 23, 2022",11:55 IST,science,,What is GSAT-7B satellite that will boost Indi...,The Indian Army is set to get a long-awaited d...,India Today Web Desk,https://www.indiatoday.in/science/story/what-i...
2379,426,"March 23, 2022",10:41 IST,science,,Arctic missing ice that could have covered twi...,With climate change raging and impacts seen ac...,India Today Web Desk,https://www.indiatoday.in/science/story/climat...
2380,427,"March 22, 2022",17:14 IST,science,,"India not alone in facing heatwave, both of Ea...",With the climate crisis becoming rampant and e...,India Today Web Desk,https://www.indiatoday.in/science/story/earth-...


In [9]:
display(Scraped_News.info(), Scraped_News.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2382 entries, 0 to 2381
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    2382 non-null   int64 
 1   Date          2382 non-null   object
 2   Time          2382 non-null   object
 3   Category      2382 non-null   object
 4   Sub-Category  2382 non-null   object
 5   Headline      2382 non-null   object
 6   Entire_News   2382 non-null   object
 7   Author        2381 non-null   object
 8   News_Link     2382 non-null   object
dtypes: int64(1), object(8)
memory usage: 167.6+ KB


None

Unnamed: 0.1,Unnamed: 0
count,2382.0
mean,462.314861
std,305.881143
min,0.0
25%,204.0
50%,410.5
75%,706.0
max,1146.0


In [10]:
non_redun_Scraped_News = remove_redun_rows(Scraped_News, default_cols = Scraped_News.columns)

KeyError: Index(['Summary'], dtype='object')

In [10]:
non_redun_Scraped_News

Unnamed: 0,Date,Time,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
0,28-Jan-21,7:35:55 AM,Sports,Badminton,Patient pays… almost: Srikanth tries to stay i...,Kidambi Srikanth gritted it out and though he ...,In Test cricket’s season of spectacular stubbo...,Shivani naik,https://indianexpress.com/article/sports/badmi...
1,27-Jan-21,4:51:08 PM,Sports,Badminton,BWF World Tour Finals: Fighting PV Sindhu lose...,This was PV Sindhu's 16th defeat to Tai Tzu Yi...,World champion shuttler P V Sindhu went down f...,Pti,https://indianexpress.com/article/sports/badmi...
2,27-Jan-21,8:30:22 AM,Sports,Badminton,"World Tour Finals Preview: PV Sindhu, recharge...",With the Indian having played more matches tha...,Carolina Marin (50 total) played 24 tournament...,Shivani naik,https://indianexpress.com/article/sports/badmi...
3,24-Jan-21,7:03:06 PM,Sports,Badminton,Satwiksairaj’s offence gets neutralised by sav...,Satwiksairaj Rankireddy uses big smash to kill...,One would have to be blind to not figure that ...,Shivani naik,https://indianexpress.com/article/sports/badmi...
4,23-Jan-21,7:58:44 PM,Sports,Badminton,Dream run of Indian doubles pairs end with sem...,"Up against the world number three Thai pair, S...",The Indian mixed doubles pair of Satwiksairaj ...,Pti,https://indianexpress.com/article/sports/badmi...
...,...,...,...,...,...,...,...,...,...
24882,"February 06, 2021",12:17 am IST,entertainment,television,"Bigg Boss 14 February 5, 2021, Written Update:...",Bigg Boss 14: Devoleena lashed out at Arshi fo...,"On the 125th day in the Bigg Boss house, Devol...",Aakanksha Raghuvanshi,https://www.ndtv.com/entertainment/bigg-boss-1...
24883,"February 03, 2021",12:23 pm IST,entertainment,television,A Tour Of Shaheer Sheikh And Ruchikaa Kapoor's...,Pictures from Shaheer and Ruchikaa's swanky ap...,"TV actors Shaheer Sheikh and Ruchikaa Kapoor, ...",Aakanksha Raghuvanshi,https://www.ndtv.com/entertainment/a-tour-of-s...
24884,"February 01, 2021",11:56 pm IST,entertainment,television,"Bigg Boss 14: Nikki Tamboli, Rahul Vaidya's Fi...","After a war of words, Nikki Tamboli agreed to ...",Monday's episode of Bigg Boss 14 was fulll of ...,Nilanjana Basu,https://www.ndtv.com/entertainment/bigg-boss-1...
24885,"January 29, 2021",2:28 pm IST,entertainment,television,Identify The Comedy Star In This Pic From 28 Y...,The young Kapil Sharma in the photo is barely ...,"Every now and then, actor-comedian Kapil Sharm...",Nilanjana Basu,https://www.ndtv.com/entertainment/heres-what-...


# Date and Time Reformatting 

In [11]:
import datetime

pub_dt_format = {"indianexpress": "%d-%b-%y %I:%M:%S %p" , "republicworld": "%d %B, %Y %H:%M IST", "dailymail": "%d/%m/%y %H:%M", "indiatoday": "%d-%b-%y %H:%M IST", "ibtimes": "%d-%b-%y %H:%M IST", "ndtv": "%B %d, %Y %I:%M %p IST"}

def dt_merge(df, input_cols = ["Date", "Time"], target_col = "Datetime"):
    if target_col not in df.columns:
        df.insert(0, target_col, df[input_cols].agg(" ".join, axis = 1))
        df = df.drop(["Date", "Time"], axis = 1)
    else:
        print("'{}' already exists in the given DataFrame".format(target_col))
    return df

def dt_reformat(ds, id_col = "News_Link", target_col = "Datetime", date_format_dict = pub_dt_format):
    """Note: This function should only be applied on the Corpus DataFrame by using df.apply(dt_reformat, axis = 1)"""
    for k, v in date_format_dict.items():
        if k in ds.loc[id_col]:
            dt_obj = pd.to_datetime(ds.loc[target_col], format=v, infer_datetime_format=True)
            iso_dttime = dt_obj.strftime("%Y-%m-%d %H:%M:%S")
            ds.loc[target_col] = iso_dttime
    return ds

In [12]:
non_redun_Scraped_News = dt_merge(non_redun_Scraped_News)

NameError: name 'non_redun_Scraped_News' is not defined

In [13]:
non_redun_Scraped_News

NameError: name 'non_redun_Scraped_News' is not defined

# Category Reformatting

In [14]:
def cat_reformat(df):
    df['Category'] = df['Category'].replace(['business'],'Business')
    df['Category'] = df['Category'].replace(['Business-news'],'Business')
    df['Category'] = df['Category'].replace(['India-news'],'India')
    df['Category'] = df['Category'].replace(['Sports-news'],'Sports')
    df['Category'] = df['Category'].replace(['sports'],'Sports')
    df['Category'] = df['Category'].replace(['tvshowbiz'],'Entertainment')
    df['Category'] = df['Category'].replace(['entertainment'],'Entertainment')
    df['Category'] = df['Category'].replace(['Television'],'Entertainment')
    df['Category'] = df['Category'].replace(['Entertainment-news'],'Entertainment')
    df['Category'] = df['Category'].replace(['Technology-news'],'Technology')
    df['Category'] = df['Category'].replace(['technology'],'Technology')
    df['Category'] = df['Category'].replace(['World-news'],'World')
    df['Category'] = df['Category'].replace(['news'],'News')
    df['Category'] = df['Category'].replace(['society'],'Society')
    df['Category'] = df['Category'].replace(['Data-intelligence-unit'],'News')
    df['Category'] = df['Category'].replace(['Movies'],'Entertainment')
    df['Category'] = df['Category'].replace(['Education-today'],'Education')
    df['Category'] = df['Category'].replace(['Cities'],'India')
    return df

In [15]:
non_redun_Scraped_News = cat_reformat(non_redun_Scraped_News)

NameError: name 'non_redun_Scraped_News' is not defined

In [16]:
non_redun_Scraped_News

NameError: name 'non_redun_Scraped_News' is not defined

# Further Exploration

In [17]:
non_redun_Scraped_News = non_redun_Scraped_News.apply(dt_reformat, axis = 1)

In [18]:
non_redun_Scraped_News

Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
0,2021-01-28 07:35:55,Sports,Badminton,Patient pays… almost: Srikanth tries to stay i...,Kidambi Srikanth gritted it out and though he ...,In Test cricket’s season of spectacular stubbo...,Shivani naik,https://indianexpress.com/article/sports/badmi...
1,2021-01-27 16:51:08,Sports,Badminton,BWF World Tour Finals: Fighting PV Sindhu lose...,This was PV Sindhu's 16th defeat to Tai Tzu Yi...,World champion shuttler P V Sindhu went down f...,Pti,https://indianexpress.com/article/sports/badmi...
2,2021-01-27 08:30:22,Sports,Badminton,"World Tour Finals Preview: PV Sindhu, recharge...",With the Indian having played more matches tha...,Carolina Marin (50 total) played 24 tournament...,Shivani naik,https://indianexpress.com/article/sports/badmi...
3,2021-01-24 19:03:06,Sports,Badminton,Satwiksairaj’s offence gets neutralised by sav...,Satwiksairaj Rankireddy uses big smash to kill...,One would have to be blind to not figure that ...,Shivani naik,https://indianexpress.com/article/sports/badmi...
4,2021-01-23 19:58:44,Sports,Badminton,Dream run of Indian doubles pairs end with sem...,"Up against the world number three Thai pair, S...",The Indian mixed doubles pair of Satwiksairaj ...,Pti,https://indianexpress.com/article/sports/badmi...
...,...,...,...,...,...,...,...,...
24882,2021-02-06 00:17:00,Entertainment,television,"Bigg Boss 14 February 5, 2021, Written Update:...",Bigg Boss 14: Devoleena lashed out at Arshi fo...,"On the 125th day in the Bigg Boss house, Devol...",Aakanksha Raghuvanshi,https://www.ndtv.com/entertainment/bigg-boss-1...
24883,2021-02-03 12:23:00,Entertainment,television,A Tour Of Shaheer Sheikh And Ruchikaa Kapoor's...,Pictures from Shaheer and Ruchikaa's swanky ap...,"TV actors Shaheer Sheikh and Ruchikaa Kapoor, ...",Aakanksha Raghuvanshi,https://www.ndtv.com/entertainment/a-tour-of-s...
24884,2021-02-01 23:56:00,Entertainment,television,"Bigg Boss 14: Nikki Tamboli, Rahul Vaidya's Fi...","After a war of words, Nikki Tamboli agreed to ...",Monday's episode of Bigg Boss 14 was fulll of ...,Nilanjana Basu,https://www.ndtv.com/entertainment/bigg-boss-1...
24885,2021-01-29 14:28:00,Entertainment,television,Identify The Comedy Star In This Pic From 28 Y...,The young Kapil Sharma in the photo is barely ...,"Every now and then, actor-comedian Kapil Sharm...",Nilanjana Basu,https://www.ndtv.com/entertainment/heres-what-...


In [19]:
non_redun_Scraped_News.to_csv("Final_Clean_Corpus.csv")

In [20]:
non_redun_Scraped_News.describe()

Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
count,22753,22753,22037,22753,22753,22753,22303,22753
unique,20611,13,141,22733,22674,22700,1184,22749
top,2021-03-13 09:00:00,Entertainment,bollywood,"Mist-shrouded churches, eerie shipwrecks and m...",Flipkart Sale on electronics is back and the w...,"PromotedListen to the latest songs, only on Ji...",Associated press television news,https://www.republicworld.com/india-news/law-a...
freq,10,4554,435,3,4,17,1845,2


In [21]:
remove_redun_rows(non_redun_Scraped_News, default_cols = non_redun_Scraped_News.columns, meta_col_subset=["Datetime"]).describe()

Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
10724,2021-03-17 09:57:00,India,,Ambani bomb scare case: Here is what NIA told ...,The National Investigation Agency told the cou...,Specific inputs gathered by National Investiga...,Vidya,https://www.indiatoday.in/india/story/ambani-b...
10725,2021-03-17 09:38:00,India,,Former Union minister and BJP leader Dilip Gan...,Former Union minister and BJP leader Dilip Gan...,Former Union minister and BJP leader Dilip Gan...,India today web desk,https://www.indiatoday.in/india/story/former-u...
10726,2021-03-17 09:08:00,India,,Vaccine solidarity a major focus as PM Modi ho...,Vaccine solidarity was a major focus of the co...,India and Finland on Tuesday held “wide rangin...,Geeta mohan,https://www.indiatoday.in/india/story/india-pm...
10727,2021-03-17 08:46:00,India,,"No tradition of phone tapping in Rajasthan, sa...",BJP leaders on Tuesday demanded that a discuss...,Rajasthan Chief Minister Ashok Gehlot hit back...,Dev ankur wadhawan,https://www.indiatoday.in/india/story/no-tradi...
10728,2021-03-17 07:18:00,India,,"Delhi riots: HC grants bail to 4 men, question...",The Delhi High Court on Tuesday granted bail t...,The Delhi High Court Tuesday granted bail to f...,Press trust of india,https://www.indiatoday.in/india/story/delhi-ri...
...,...,...,...,...,...,...,...,...
15724,2020-01-19 15:39:00,Entertainment,Top-stories,Sushant Singh joins CAA protests in Shaheen Ba...,Sushant Singh joined the CAA protests in Shahe...,It is time for the youth and students to come ...,,https://www.indiatoday.in/television/top-stori...
15740,2019-12-02 20:13:00,Entertainment,Top-stories,Bigg Boss 13 to Naagin 4: Your must-see TV gui...,"From family drama to reality shows, check out ...",Indian TV shows have become an inevitable part...,,https://www.indiatoday.in/television/top-stori...
15769,2019-09-07 13:52:00,Entertainment,Top-stories,Protest erupts over telecast of TV show Ram Si...,Colors TV's Ram Siya Ke Luv Kush has allegedly...,Valmiki community of Punjab staged a protest o...,,https://www.indiatoday.in/television/top-stori...
15772,2019-09-04 12:49:00,Entertainment,Top-stories,Shiv Sena member accuses Netflix of defaming H...,Shiv Sena IT Cell member has alleged that Netf...,"Ramesh Solanki, a member of Shiv Sena IT Cell,...",,https://www.indiatoday.in/television/top-stori...


Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
count,22753,22753,22037,22753,22753,22753,22303,22753
unique,20611,13,141,22733,22674,22700,1184,22749
top,2021-03-13 09:00:00,Entertainment,bollywood,"Mist-shrouded churches, eerie shipwrecks and m...",Flipkart Sale on electronics is back and the w...,"PromotedListen to the latest songs, only on Ji...",Associated press television news,https://www.republicworld.com/india-news/law-a...
freq,10,4554,435,3,4,17,1845,2


In [22]:
temp_i_list = list()
for i in non_redun_Scraped_News.index:
    if non_redun_Scraped_News.loc[i, "Entire_News"].startswith("PromotedListen to the latest songs, only on"):
        print(i)
        temp_i_list.append(i)

24815
24816
24817
24819
24820
24821
24822
24823
24824
24825
24826
24827
24828
24830
24835
24837
24839


In [23]:
non_redun_Scraped_News.loc[temp_i_list, :]

Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
24815,2017-06-02 07:56:00,Entertainment,hollywood,"Today's Big Release: Baywatch, Wonder Woman, A...",Movie-goers have ample choice this week betwee...,"PromotedListen to the latest songs, only on Ji...",Divya Goyal,https://www.ndtv.com/entertainment/todays-big-...
24816,2017-06-01 20:16:00,Entertainment,hollywood,Baywatch Preview: Priyanka Chopra's Hollywood ...,Baywatch: Priyanka plays the villainous charac...,"PromotedListen to the latest songs, only on Ji...",Nilanjana Basu,https://www.ndtv.com/entertainment/baywatch-pr...
24817,2017-06-01 17:57:00,Entertainment,hollywood,How Wonder Woman Cracked The Superhero Movie G...,"On set, Jenkins can be a physical director, de...","PromotedListen to the latest songs, only on Ji...","Michael Cavna, The Washington Post",https://www.ndtv.com/entertainment/how-wonder-...
24819,2017-05-25 17:54:00,Entertainment,hollywood,Pirates Of The Caribbean 5 Preview: Jack Sparr...,"In Pirates Of The Caribbean 5, Jack Sparrow's ...","PromotedListen to the latest songs, only on Ji...",Divya Goyal,https://www.ndtv.com/entertainment/pirates-of-...
24820,2017-05-25 17:10:00,Entertainment,hollywood,"In Ridley Scott's New Alien Movies, The Xenomo...","Alien: Covenant, like Prometheus before it, mi...","PromotedListen to the latest songs, only on Ji...","Sonny Bunch, The Washington Post",https://www.ndtv.com/entertainment/in-ridley-s...
24821,2017-05-24 20:27:00,Entertainment,hollywood,Priyanka Chopra Reveals Who Her 'Dream Co-Star...,Actress Priyanka Chopra is busy promoting her ...,"PromotedListen to the latest songs, only on Ji...",Tishya Misra,https://www.ndtv.com/entertainment/priyanka-ch...
24822,2017-05-24 16:44:00,Entertainment,hollywood,Spider-Man: Homecoming Trailer - Tom Holland S...,Spider-Man: Homecoming Trailer: Tom Holland re...,"PromotedListen to the latest songs, only on Ji...",Nilanjana Basu,https://www.ndtv.com/entertainment/spider-man-...
24823,2017-05-24 16:30:00,Entertainment,hollywood,"Brad Pitt Brings War Machine To Mumbai, Will C...","War Machine, made under the stewardship of Ani...","PromotedListen to the latest songs, only on Ji...",Rohit Khilnani,https://www.ndtv.com/entertainment/brad-pitt-b...
24824,2017-05-23 18:44:00,Entertainment,hollywood,Priyanka Chopra May Skip Baywatch Screening Sh...,Priyanka Chopra has asked her mother to take c...,"PromotedListen to the latest songs, only on Ji...",Divya Goyal,https://www.ndtv.com/entertainment/priyanka-ch...
24825,2017-05-23 18:54:00,Entertainment,hollywood,"Kabir Khan, Siddharth Anand To Direct Indo-Sin...","Of the two films, Kabir Khan will helm a movie...","PromotedListen to the latest songs, only on Ji...",Nilanjana Basu,https://www.ndtv.com/entertainment/kabir-khan-...


In [24]:
for i in non_redun_Scraped_News.loc[temp_i_list, "Entire_News"].tolist():
    print(i)
    print("-"*115)

PromotedListen to the latest songs, only on JioSaavn.com 
-------------------------------------------------------------------------------------------------------------------
PromotedListen to the latest songs, only on JioSaavn.com 
-------------------------------------------------------------------------------------------------------------------
PromotedListen to the latest songs, only on JioSaavn.com 
-------------------------------------------------------------------------------------------------------------------
PromotedListen to the latest songs, only on JioSaavn.com 
-------------------------------------------------------------------------------------------------------------------
PromotedListen to the latest songs, only on JioSaavn.com 
-------------------------------------------------------------------------------------------------------------------
PromotedListen to the latest songs, only on JioSaavn.com 
------------------------------------------------------------------------

In [25]:
temp_i_list = list()
for i in non_redun_Scraped_News.index:
    if non_redun_Scraped_News.loc[i, "Summary"].startswith("The state-run oil refiner Indian Oil Corpor"):
        print(i)
        temp_i_list.append(i)

24535
24554
24571
24589


In [26]:
for i in non_redun_Scraped_News.loc[temp_i_list, "Entire_News"].tolist():
    print(i)
    print("-"*115)

Petrol, Diesel Prices Today: Petrol and diesel prices remained unchanged across the four metros for 19 days in a row on Thursday, March 18. The state-run oil refiner Indian Oil Corporation had last raised fuel prices on February 27 to an all-time high of Rs 91.17 in the national capital. Since then, fuel rates have been steady across the four metro cities. As of now, the highest fuel prices are in Mumbai where petrol is at Rs 97.57 per litre and diesel at Rs 88.60 per litre. (Also Read: How To Find Latest Petrol, Diesel Rates In Your City) The oil marketing companies - Bharat Petroleum, Indian Oil Corporation and Hindustan Petroleum align the rates of domestic fuel with the global benchmarks, by taking into account any revisions in the foreign exchange rates. Any changes in fuel rates are implemented with effect from 6 am each day. The domestic petrol and diesel prices vary across states due to the value-added tax (VAT). PromotedListen to the latest songs, only on JioSaavn.com Meanwhil

In [27]:
non_redun_Scraped_News.loc[temp_i_list, :]

Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
24535,2021-03-18 08:22:00,Business,latest,"No Change In Petrol, Diesel Rates On Thursday",The state-run oil refiner Indian Oil Corporati...,"Petrol, Diesel Prices Today: Petrol and diesel...",Peter Noronha,https://www.ndtv.com/business/petrol-diesel-pr...
24554,2021-03-17 08:09:00,Business,latest,"Petrol, Diesel Rates Steady For Eighteenth Con...",The state-run oil refiner Indian Oil Corporati...,"Petrol, Diesel Prices Today: Petrol and diesel...",Peter Noronha,https://www.ndtv.com/business/petrol-diesel-pr...
24571,2021-03-16 08:16:00,Business,latest,"Petrol, Diesel Rates Steady On Tuesday",The state-run oil refiner Indian Oil Corporati...,"Petrol, Diesel Prices Today: Petrol and diesel...",Peter Noronha,https://www.ndtv.com/business/petrol-diesel-pr...
24589,2021-03-15 07:58:00,Business,latest,"Petrol, Diesel Rates Unchanged On Monday",The state-run oil refiner Indian Oil Corporati...,"Petrol, Diesel Prices Today: Petrol and diesel...",Peter Noronha,https://www.ndtv.com/business/petrol-diesel-pr...


In [28]:
temp_i_list = list()
for i in non_redun_Scraped_News.index:
    if non_redun_Scraped_News.loc[i, "Headline"].startswith("Mist-shrouded churches, eerie shipwrecks and "):
        print(i)
        temp_i_list.append(i)

4697
4708
4709


In [29]:
for i in non_redun_Scraped_News.loc[temp_i_list, "Headline"].tolist():
    print(i)
    print("-"*115)

Mist-shrouded churches, eerie shipwrecks and magical doorways: The incredible winning images in the Historic Photographer of the Year 2020 contest
-------------------------------------------------------------------------------------------------------------------
Mist-shrouded churches, eerie shipwrecks and magical doorways: The incredible winning images in the Historic Photographer of the Year 2020 contest
-------------------------------------------------------------------------------------------------------------------
Mist-shrouded churches, eerie shipwrecks and magical doorways: The incredible winning images in the Historic Photographer of the Year 2020 contest
-------------------------------------------------------------------------------------------------------------------


In [30]:
non_redun_Scraped_News.loc[temp_i_list, :]

Unnamed: 0,Datetime,Category,Subcategory,Headline,Summary,Entire_News,Author,News_Link
4697,2020-12-22 19:17:00,News,syria,"Mist-shrouded churches, eerie shipwrecks and m...",This year's awards called on photographers to ...,These incredible pictures offer a window to hi...,By MailOnline Reporter,https://www.dailymail.co.uk/galleries/article-...
4708,2020-11-27 13:24:00,News,syria,"Mist-shrouded churches, eerie shipwrecks and m...",The winning and shortlisted images in the Hist...,These incredible pictures offer a window to hi...,By MailOnline Reporter,https://www.dailymail.co.uk/galleries/article-...
4709,2020-11-26 15:24:00,News,syria,"Mist-shrouded churches, eerie shipwrecks and m...",The winning and shortlisted images in the Hist...,These incredible pictures offer a window to hi...,By Jennifer Newton for MailOnline,https://www.dailymail.co.uk/travel/escape/arti...


# Exporting the Dataframe as CSV

In [33]:
non_redun_Scraped_News.to_csv("Final_Reformatted_Clean_Corpus.csv", index=False)