In [43]:
# import required libraries
import spacy
import contractions
import requests
import re
import unicodedata

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from bs4 import BeautifulSoup

In [4]:
# import text file
with open('oriam.txt') as f:
    sample_text = f.read()

print(sample_text)

Oriam
From Wikipedia, the free encyclopedia
Jump to navigationJump to search
Oriam is Scotland's national performance centre for sport, based at Heriot-Watt University's Riccarton campus in Edinburgh. The Scottish Rugby Union and the Scottish Football Association use it as a training facility, Heart of Midlothian F.C. rent the centre for first-team training and to run their academy. It is also used by other sports teams, students and members of the public.


Contents
1	History
2	Sports
3	Facilities
4	See also
5	References
6	External links
History
A review of Scottish football, led by former First Minister of Scotland Henry McLeish, picked up on the lack of facilities in Scotland in the first report published in April 2010.[1] In February 2012, Sport Minister Shona Robison announced that Â£25 million from the Scottish Government's Young Scots Fund would be put towards a new multi-sports centre which would include a national football academy.[2] Universities, colleges and local authoriti

In [15]:
def clean_text(text):
    """Perform basic text pre-processing."""
    
    # tokenize text
    text_tokenized = word_tokenize(text)
    # remove duplicates
    text_no_dup = set(text_tokenized)
    # keep only alphanumeric tokens
    text_alpha = [w for w in text_no_dup if w.isalnum()]
    # lower case all tokens
    text_lower = [w.lower() for w in text_alpha]
    # remove trailing whitespace
    text_ws = [w.strip() for w in text_lower]
    # import stopwords list
    stop_words = stopwords.words('english')
    # keep words that are not defined as stopwords
    text_ns = [w for w in text_ws if w not in stop_words]
    # expand contractions
    text_nc = [contractions.fix(w) for w in text_ns]
    # return the cleaned text
    return text_nc
    
def lemmatize_text(text):
    """Lemmatize text."""
    
    # tokenize text
    tokens = word_tokenize(text)
    # expand contractions
    text_nc = [contractions.fix(w) for w in tokens]
    # POS-tag tokens
    tagged_tokens = pos_tag(text_nc)
    # convert tags to WordNet tags
    tag_map = {'j': wordnet.ADJ, 'v': wordnet.VERB,
               'n': wordnet.NOUN, 'r': wordnet.ADV}
    wn_tagged_tokens = [
        (word, tag_map.get(tag[0].lower(), wordnet.NOUN))
        for word, tag in tagged_tokens]
    # lemmatize text
    wnl = WordNetLemmatizer()
    lemmatized_text = " ".join(wnl.lemmatize(word, tag)
                               for word, tag in wn_tagged_tokens)
    
    return lemmatized_text

print("Text:\n{}".format(sample_text)[:150])
print("\nText Tokenized:\n{}".format(word_tokenize(sample_text)[:20]))
print("\nText Cleaned:\n{}\n".format(clean_text(sample_text)[:20]))
print("Text Lemmatized:\n{}".format(lemmatize_text(sample_text)[:152]))

Text:
Oriam
From Wikipedia, the free encyclopedia
Jump to navigationJump to search
Oriam is Scotland's national performance centre for sport, based at

Text Tokenized:
['Oriam', 'From', 'Wikipedia', ',', 'the', 'free', 'encyclopedia', 'Jump', 'to', 'navigationJump', 'to', 'search', 'Oriam', 'is', 'Scotland', "'s", 'national', 'performance', 'centre', 'for']

Text Cleaned:
['members', 'height', 'council', 'first', 'defied', 'scots', 'history', 'association', 'frame', 'free', 'hall', 'invited', 'also', '3500i', 'two', 'began', '9', 'chosen', 'references', 'announced']

[('Oriam', 'NN'), ('From', 'IN'), ('Wikipedia', 'NNP'), (',', ','), ('the', 'DT')]

 {'j': 'a', 'v': 'v', 'n': 'n', 'r': 'r'} 

[('Oriam', 'n'), ('From', 'n'), ('Wikipedia', 'n'), (',', 'n'), ('the', 'n'), ('free', 'a'), ('encyclopedia', 'n'), ('Jump', 'n'), ('to', 'n'), ('navigationJump', 'v'), ('to', 'n'), ('search', 'v'), ('Oriam', 'n'), ('is', 'v'), ('Scotland', 'n'), ("'s", 'n'), ('national', 'a'), ('performance', 'n'

In [17]:
# load pipeline
nlp = spacy.load('en_core_web_sm')

# create NLP object
text_spacy = nlp(sample_text)

# tokenize
tokens = [token for token in text_spacy]
print(tokens[:20])

[Oriam, 
, From, Wikipedia, ,, the, free, encyclopedia, 
, Jump, to, navigationJump, to, search, 
, Oriam, is, Scotland, 's, national]


In [29]:
data = requests.get("https://oriamscotland.com/")
                    
content = data.text

print(content[:100])

bs = BeautifulSoup(content, 'html.parser')

text = bs.get_text()

stripped_text = re.sub(r'[\r|\n|\r\n|\t]+', ' ', text)
stripped_text

<!doctype html>
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!-


"   Oriam - Oriam Scotland Scotland's National Sports Performance Centre                   Academic Facilities Gym Synthetic Pitches Sports Hall Conference Performance Wing Grass Pitches Members Area NEW MEMBERS Member Cancellation Form Sport Community Sport Student Sport Performance Sport Health & Fitness Inductions & Personal Training My Wellness and On Demand Exercise Classes Membership Options Membership Offers News About Us ORIAM FAQs Contact Us Bistro Partners Job Vacancies Terms & Conditions   Academic Facilities Gym Synthetic Pitches Sports Hall Conference Performance Wing Grass Pitches Members Area NEW MEMBERS Member Cancellation Form Sport Community Sport Student Sport Performance Sport Health & Fitness Inductions & Personal Training My Wellness and On Demand Exercise Classes Membership Options Membership Offers News About Us ORIAM FAQs Contact Us Bistro Partners Job Vacancies Terms & Conditions     MEMBERSHIP OFFERFind out more...    RE-OPENING INFO& FAQs    DOWNLOAD OUR NEW

In [35]:
rus_text = "Не́которые иностра́нцы ду́мают, что в Росси́и медве́ди хо́дят по у́лицам. Коне́чно, \
э́то непра́вда! Медве́ди живу́т в лесу́ и не лю́бят люде́й."

s = 'Sómě Áccěntěd těxt'

clean_text = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('utf-8', 'ignore')
clean_text

'Some Accented text'

In [48]:
text = 'I am the best'

nlp = spacy.load('en_core_web_sm')

text_spacy = nlp(text)

text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text_spacy])
text

'I be the good'