# Article Topic Analysis
## Overview
The purpose of this repository is to analyse, visualize, and list the topics included in a set of articles.<br>
These articles come from a dataset I have created over the last two years by collecting the URLs of articles of interest.
## Process
1. Data Extraction: Download the text from each URL (using BeautifulSoup)
2. Data Cleansing: Clean dataset with common NLP techniques (stopword removal, lemmatization, etc.) (using NLTK)
3. Saving Data: Saving the extracted dataset
4. Data Visualization: Visualize with PCA or T-SNE (using Scikit-learn)
5. 

In [1]:
# Imports
import csv
import pickle
from collections import defaultdict
from urllib.request import urlopen

from bs4 import BeautifulSoup
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

## Data Extraction and Cleansing

In [2]:
# text extraction
def openUrlAndExtractText(url):
    """Download the page at *url* and return its text, lower-cased.

    The response is parsed with BeautifulSoup's "html.parser". Every
    'style', 'script', '[document]', 'head' and 'title' element is removed
    from the parse tree before the remaining text is collected.

    Args:
        url: Address handed to urllib.request.urlopen.

    Returns:
        The remaining page text as one lower-case string.

    Raises:
        Any exception raised by urlopen or by the parser is propagated
        unchanged to the caller.
    """
    # Close the HTTP response as soon as it has been parsed; previously it
    # was never closed, leaving one open connection per article.
    with urlopen(url) as site:
        soup = BeautifulSoup(site, "html.parser")

    # Plain loop: removing the elements is a side effect, so there is no
    # reason to build a throwaway list of the extracted nodes.
    for element in soup(['style', 'script', '[document]', 'head', 'title']):
        element.extract()

    return soup.getText().lower()
    
def extractSentencesFromText(text, minSizeOfSentence):
    """Split *text* into sentences, ignoring lines that are too short.

    The text is cut at newline characters and every line with fewer than
    *minSizeOfSentence* characters is discarded. Each remaining line is
    passed to sent_tokenize, since a single line may hold several
    sentences.

    Args:
        text: The raw text to split.
        minSizeOfSentence: Minimum number of characters a line must have
            to be kept.

    Returns:
        A flat list of the sentences found, in their original order.
    """
    longEnoughLines = (
        line for line in text.split("\n") if len(line) >= minSizeOfSentence
    )
    return [
        sentence
        for line in longEnoughLines
        for sentence in sent_tokenize(line)
    ]

def cleanSentences(sentences):
    """Strip surrounding whitespace from each sentence and drop trivial ones.

    A sentence is discarded when at most one character is left after
    stripping. The length is checked on the stripped text so that
    whitespace-only or whitespace-padded entries such as "  " or " a "
    cannot pass the filter and end up in the result as "" or "a".

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A new list with the stripped sentences that are longer than one
        character, in their original order.
    """
    strippedSentences = (sentence.strip() for sentence in sentences)
    return [sentence for sentence in strippedSentences if len(sentence) > 1]

def turnSenteceListToWordCountDict(sentences):
    """Count how often each token occurs across all *sentences*.

    Every sentence is split with word_tokenize and the occurrences of each
    resulting token are added up.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A defaultdict(int) mapping each token to its number of occurrences.
    """
    tokenStream = (
        token
        for sentence in sentences
        for token in word_tokenize(sentence)
    )

    tally = defaultdict(int)
    for token in tokenStream:
        tally[token] = tally[token] + 1
    return tally


In [3]:
#loading dataset
def getDatasetURLList(fileLoc):
    """Read the first column of a comma-separated file as a list of URLs.

    Args:
        fileLoc: Path of the CSV file to read.

    Returns:
        A list with the first field of every non-empty row, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' is the mode the csv module asks for, so that it can handle
    # line endings (including newlines inside quoted fields) itself.
    with open(fileLoc, newline='') as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        # csv.reader yields an empty list for a blank line; indexing it with
        # row[0] would raise IndexError, so such rows are skipped.
        return [row[0] for row in readCSV if row]



## NLP Cleansing

In [None]:
#nlp cleaning methods
def nlpCleaning(wordCountDict):
    """Run the NLP clean-up steps on a word-count mapping.

    The mapping is first passed through removeStopWords and the result is
    then passed through lemmatizeWordCountDict.

    Args:
        wordCountDict: Mapping of word -> number of occurrences.

    Returns:
        The mapping produced by the lemmatization step.
    """
    withoutStopWords = removeStopWords(wordCountDict)
    return lemmatizeWordCountDict(withoutStopWords)

def removeStopWords(wordCountDict):
    """Return a copy of *wordCountDict* without English stop words.

    Args:
        wordCountDict: Mapping of word -> number of occurrences.

    Returns:
        A new defaultdict(int) holding every entry whose word is not in
        nltk's English stop-word list. The input mapping is not modified.
    """
    cleanedWordCountDict = defaultdict(int)
    if not wordCountDict:
        # Nothing to filter, so the stop-word list does not have to be loaded.
        return cleanedWordCountDict

    # Fetch the stop-word list once and keep it in a set. Previously
    # stopwords.words('english') was evaluated again for every single word
    # and the returned list was searched linearly.
    englishStopWords = set(stopwords.words('english'))

    for word, count in wordCountDict.items():
        if word not in englishStopWords:
            cleanedWordCountDict[word] = count

    return cleanedWordCountDict
        
def lemmatizeWordCountDict(wordCountDict):
    """Merge the counts of words that share the same lemma.

    Each word is lemmatized with a WordNetLemmatizer, using the part of
    speech reported by get_wordnet_pos, and its count is added to the
    total of the resulting lemma.

    Args:
        wordCountDict: Mapping of word -> number of occurrences.

    Returns:
        A new defaultdict(int) mapping lemma -> summed number of occurrences.
    """
    wordnetLemmatizer = WordNetLemmatizer()
    countsByLemma = defaultdict(int)

    for token, occurrences in wordCountDict.items():
        partOfSpeech = get_wordnet_pos(token)
        lemma = wordnetLemmatizer.lemmatize(token, partOfSpeech)
        countsByLemma[lemma] = countsByLemma[lemma] + occurrences

    return countsByLemma

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to lemmatize *word* with.

    The word is tagged on its own with pos_tag and the upper-cased first
    character of the tag selects the constant: "J" -> wordnet.ADJ,
    "V" -> wordnet.VERB, "R" -> wordnet.ADV. "N" and every other tag fall
    back to wordnet.NOUN.
    """
    taggedWords = pos_tag([word])
    tagInitial = taggedWords[0][1][0].upper()

    if tagInitial == "J":
        return wordnet.ADJ
    if tagInitial == "V":
        return wordnet.VERB
    if tagInitial == "R":
        return wordnet.ADV
    # "N" as well as any unrecognised tag is treated as a noun.
    return wordnet.NOUN

# Running Data extraction

In [None]:
#Input variables
dataSetCSVLocation = 'Data/Article_urls_KeepTransfer0_5.csv'
dataUrls = getDatasetURLList(dataSetCSVLocation)

# Lines shorter than this many characters are ignored during extraction.
minSizeOfSentence = 2

# A pickle file is written each time this many articles have been processed.
articlesPerSaveFile = 25

#other variables
# Word-count dicts of the articles processed since the last save.
wordCountDicts = []

successes = 0
errors = 0


def _saveWordCountBatch(batch, articleCount):
    """Pickle *batch* to 'Data/Datafiles/<articleCount>.pickle'."""
    # articleCount is an int and must be converted explicitly; concatenating
    # it to the path directly raises TypeError.
    with open('Data/Datafiles/' + str(articleCount) + '.pickle', 'wb') as handle:
        pickle.dump(batch, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(articleCount, "saved")


#extracting the data from each url
for url in dataUrls:
    # Only the per-article extraction is best-effort. Catching Exception
    # rather than using a bare except keeps Ctrl-C able to stop the run.
    try:
        #extracting raw text from site
        rawText = openUrlAndExtractText(url)
        #finding sentences in the text
        rawSentences = extractSentencesFromText(rawText, minSizeOfSentence)
        #cleaning sentences up
        cleanedSentences = cleanSentences(rawSentences)

        wordCountDict = turnSenteceListToWordCountDict(cleanedSentences)

        cleanedWordCountDict = nlpCleaning(wordCountDict)
    except Exception as exc:
        errors += 1
        print(url, "failed:", repr(exc))
        continue

    wordCountDicts.append(cleanedWordCountDict)
    successes += 1
    print(successes)

    # Saving happens outside the try block on purpose: a failure to write
    # the batch is not a problem with the URL and must not be silently
    # counted as an extraction error.
    if successes % articlesPerSaveFile == 0:
        _saveWordCountBatch(wordCountDicts, successes)
        wordCountDicts = []

# Write the articles left over after the last full batch, which would
# otherwise be lost when the number of successes is not a multiple of
# articlesPerSaveFile.
if wordCountDicts:
    _saveWordCountBatch(wordCountDicts, successes)
    wordCountDicts = []
    
    
