In [1]:
import nltk

In [2]:
# Sample input text; it is split into sentences and cleaned in the cells below.
paragraph = """At SpaceX, Musk oversees the development of rockets and spacecraft for missions to Earth orbit and ultimately to other planets. In 2008, SpaceX’s Falcon 9 rocket and Dragon spacecraft won the NASA contract to provide cargo transport to space. In 2012, SpaceX became the first commercial company to dock with the International Space Station and return cargo to Earth with the Dragon.

At Tesla, Musk has overseen product development and design from the beginning, including the all-electric Tesla Roadster, Model S and Model X, and the rollout of Supercharger stations to keep the cars juiced up. (Some of the charging stations use solar energy systems from SolarCity, of which Musk is the non-executive chair.) Transitioning to a sustainable energy economy, in which electric vehicles play a pivotal role, has been one of his central interests for almost two decades. Before this, he co-founded PayPal and served as the company's chair and CEO."""

In [3]:
# Imports used for cleaning the text: regular expressions, stemming, lemmatization and stop words
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [4]:
# Create one stemmer and one lemmatizer up front; the cleaning loops below reuse them.
ps = PorterStemmer()  # used by the stemming loop via ps.stem(word)
lem = WordNetLemmatizer()  # used by the lemmatization loop via lem.lemmatize(word)

In [6]:
# Download the 'punkt' tokenizer data, which the sentence tokenization in the next cell relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Split the paragraph into a list of sentences; each one is cleaned separately below.
sentences = nltk.sent_tokenize(paragraph)

In [9]:
# Download the stop-word corpus read by stopwords.words('english') in the cleaning loops.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# Clean every sentence and reduce the remaining words to their Porter stems.
new_sent = []  # one cleaned, stemmed string per entry of `sentences`
# Build the English stop-word set once; it is the same for every word, so
# rebuilding it inside the comprehension only wasted time.
stop_words = set(stopwords.words('english'))
for sentence in sentences:
  # Replace everything that is not an ASCII letter with a space. The upper
  # bound must be 'Z': the range 'A-z' also covers the characters that sit
  # between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so they were not removed.
  cleaning = re.sub('[^a-zA-Z]', ' ', sentence)
  cleaning = cleaning.lower()  # normalise case so stop-word matching works
  cleaning = cleaning.split()  # split on whitespace into a list of words
  # Drop stop words (is, he, are, ...) and stem whatever is left.
  cleaning = [ps.stem(word) for word in cleaning if word not in stop_words]
  cleaning = " ".join(cleaning)  # rebuild a single space-separated string
  new_sent.append(cleaning)


In [13]:
new_sent

['spacex musk overse develop rocket spacecraft mission earth orbit ultim planet',
 'spacex falcon rocket dragon spacecraft nasa contract provid cargo transport space',
 'spacex becam first commerci compani dock intern space station return cargo earth dragon',
 'tesla musk overseen product develop design begin includ electr tesla roadster model model x rollout supercharg station keep car juic',
 'charg station use solar energi system solarc musk non execut chair',
 'transit sustain energi economi electr vehicl play pivot role one central interest almost two decad',
 'co found paypal serv compani chair ceo']

In [16]:
# Download the WordNet corpus, which WordNetLemmatizer needs for the lemmatization loop below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [17]:
# Clean every sentence and lemmatize the remaining words.
new_sent_lem = []  # one cleaned, lemmatized string per entry of `sentences`
# Build the English stop-word set once; it is the same for every word, so
# rebuilding it inside the comprehension only wasted time.
stop_words = set(stopwords.words('english'))
for sentence in sentences:
  # Replace everything that is not an ASCII letter with a space. The upper
  # bound must be 'Z': the range 'A-z' also covers the characters that sit
  # between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so they were not removed.
  clean_lem = re.sub('[^a-zA-Z]', ' ', sentence)
  clean_lem = clean_lem.lower()  # normalise case so stop-word matching works
  clean_lem = clean_lem.split()  # split on whitespace into a list of words
  # Drop stop words and lemmatize the rest. lemmatize() is called without a
  # part-of-speech argument, so words are looked up as nouns; verb forms such
  # as 'became' therefore stay unchanged in the output shown below.
  clean_lem = [lem.lemmatize(word) for word in clean_lem if word not in stop_words]
  clean_lem = " ".join(clean_lem)  # rebuild a single space-separated string
  new_sent_lem.append(clean_lem)

In [18]:
new_sent_lem

['spacex musk oversees development rocket spacecraft mission earth orbit ultimately planet',
 'spacex falcon rocket dragon spacecraft nasa contract provide cargo transport space',
 'spacex became first commercial company dock international space station return cargo earth dragon',
 'tesla musk overseen product development design beginning including electric tesla roadster model model x rollout supercharger station keep car juiced',
 'charging station use solar energy system solarcity musk non executive chair',
 'transitioning sustainable energy economy electric vehicle play pivotal role one central interest almost two decade',
 'co founded paypal served company chair ceo']

In [21]:
# Build the bag-of-words model from the lemmatized sentences.
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer learns the vocabulary of the input and counts how often each
# vocabulary term occurs in each sentence. The fitted vectorizer is kept in a
# variable so the matrix columns can be mapped back to their words afterwards
# (for example through its vocabulary_ attribute).
lem_vectorizer = CountVectorizer()
# fit_transform returns a sparse document-term matrix; toarray() turns it into
# a dense array with one row per sentence and one column per vocabulary term.
final_matrix = lem_vectorizer.fit_transform(new_sent_lem).toarray()

In [20]:
final_matrix

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,

In [22]:
# Bag-of-words model built from the stemmed sentences, for comparison with the
# lemmatized one. Author's note: the lemmatized variant is the preferred input
# for sentiment analysis.
from sklearn.feature_extraction.text import CountVectorizer
# Keep the fitted vectorizer in a variable so the matrix columns can be mapped
# back to their stems afterwards (for example through its vocabulary_ attribute).
stem_vectorizer = CountVectorizer()
# One row per sentence, one column per vocabulary term, values are term counts.
final_mat_stem = stem_vectorizer.fit_transform(new_sent).toarray()

In [23]:
final_mat_stem

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,