# Writing a class

## Create the Document class that will be used for text analysis

In [43]:
from collections import Counter

# Define Document class
class Document:
    """A class for basic text analysis.

    The text is tokenized and its word frequencies are tallied once, when the
    instance is created; reassigning ``text`` afterwards does not refresh
    ``tokens`` or ``word_counts``.

    :param text: string of text to be analyzed
    :ivar text: string of text to be analyzed; set by `text` parameter
    :ivar tokens: list of the whitespace-separated words found in `text`
    :ivar word_counts: ``collections.Counter`` mapping each token to the
        number of times it occurs in `tokens`
    """

    # Method to create a new instance of Document
    def __init__(self, text):
        # Store text parameter to the text attribute
        self.text = text
        # Derived attributes: tokens must be built first because the word
        # tally is computed from them
        self.tokens = self._tokenize()
        self.word_counts = self._count_words()

    # Methods for internal use only should start with _
    # By defining methods as non-public you signal to the user that the
    # method is only meant to be used inside the package.
    def _tokenize(self):
        """Split ``self.text`` into a list of tokens.

        ``str.split()`` without an argument splits on any run of whitespace
        (spaces, tabs, newlines) and drops empty strings, so empty text
        gives ``[]`` and doubled spaces do not create ``''`` tokens.
        Punctuation and letter case are left untouched.

        :return: list of token strings
        """
        return self.text.split()

    # non-public method to tally document's word counts with Counter
    def _count_words(self):
        """Count how many times each token occurs.

        :return: ``Counter`` keyed by token
        """
        return Counter(self.tokens)

In [44]:
# Instantiate Document; tokenizing and word counting happen inside __init__
my_document = Document(
    text="Simple is better than complex. Complex is better than complicated."
)

# Show the raw text, the token list and the word tally, one per line
print(my_document.text, my_document.tokens, my_document.word_counts, sep="\n")

Simple is better than complex. Complex is better than complicated.
['Simple', 'is', 'better', 'than', 'complex.', 'Complex', 'is', 'better', 'than', 'complicated.']
Counter({'is': 2, 'better': 2, 'than': 2, 'Simple': 1, 'complex.': 1, 'Complex': 1, 'complicated.': 1})


In [46]:
# Create a Document from a longer text: the same three sentences repeated three times
python_doc = Document("Special cases aren't special enough to break the rules. Although practicality beats purity. Errors should never pass silently.Special cases aren't special enough to break the rules. Although practicality beats purity. Errors should never pass silently.Special cases aren't special enough to break the rules. Although practicality beats purity. Errors should never pass silently.")

# Show the first 5 tokens of python_doc
print(python_doc.tokens[0:5])

# Show the 5 most frequent words in python_doc together with their counts
print(python_doc.word_counts.most_common(n=5))

['Special', 'cases', "aren't", 'special', 'enough']
[('cases', 3), ("aren't", 3), ('special', 3), ('enough', 3), ('to', 3)]


# The DRY principle:

## DON'T REPEAT YOURSELF

### Instead of copy-pasting functionality that has already been written, apply the DRY principle and use inheritance to quickly create a new class.

In [54]:
# Define a SocialMedia class that is a child of the `Document` class
class SocialMedia(Document):
    """A Document that also counts hashtags and mentions.

    :param text: string of text to be analyzed
    :ivar hashtag_counts: number of distinct words in `word_counts` that
        start with '#'
    :ivar mention_counts: number of distinct words in `word_counts` that
        start with '@'
    """

    def __init__(self, text):
        # Let Document set text, tokens and word_counts first; the counts
        # below are derived from word_counts
        super().__init__(text)
        self.hashtag_counts = self._count_hashtags()
        self.mention_counts = self._count_mentions()

    def _count_hashtags(self):
        """Return how many distinct words start with '#'.

        Iterating over ``word_counts`` yields each distinct word once, so a
        hashtag that is repeated in the text is counted a single time.
        """
        # startswith() is safe for an empty-string token, whereas indexing
        # with x[0] raises IndexError on it
        hashtags = [x for x in self.word_counts if x.startswith('#')]
        return len(hashtags)

    def _count_mentions(self):
        """Return how many distinct words start with '@'.

        Iterating over ``word_counts`` yields each distinct word once, so a
        mention that is repeated in the text is counted a single time.
        """
        # startswith() is safe for an empty-string token, whereas indexing
        # with x[0] raises IndexError on it
        mentions = [x for x in self.word_counts if x.startswith('@')]
        return len(mentions)

In [57]:
# Analyze a post made of three hashtags and two mentions
social_media_analyzer = SocialMedia("#data #sql #python @anna @david")
# Print through the variable defined above; the name `s` is never assigned in
# this notebook and raises NameError on a fresh kernel
print(social_media_analyzer.word_counts)
print(social_media_analyzer.hashtag_counts)
print(social_media_analyzer.mention_counts)

Counter({'#data': 1, '#sql': 1, '#python': 1, '@anna': 1, '@david': 1})
3
2


In [60]:
# help() does not list the non-public (underscore-prefixed) methods such as
# _count_hashtags; of the methods defined on the class only __init__ appears below
help(social_media_analyzer)

Help on SocialMedia in module __main__ object:

class SocialMedia(Document)
 |  SocialMedia(text)
 |  
 |  Method resolution order:
 |      SocialMedia
 |      Document
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, text)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from Document:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [62]:
# dir() lists every attribute name on the instance: dunder methods, the
# non-public methods (_count_hashtags, _tokenize, ...) and the data attributes
dir(social_media_analyzer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_count_hashtags',
 '_count_mentions',
 '_count_words',
 '_tokenize',
 'hashtag_counts',
 'mention_counts',
 'text',
 'tokens',
 'word_counts']

In [69]:
# Define a Tweets class that inherits from SocialMedia
class Tweets(SocialMedia):
    """A SocialMedia document that also collects its retweet tokens.

    :param text: string of text to be analyzed
    :ivar retweets: SocialMedia object built from the tokens of `text`
        that start with "RT"
    """

    def __init__(self, text):
        # Call parent's __init__ with super()
        super().__init__(text)
        # Define retweets attribute with non-public method
        self.retweets = self._process_retweets()

    def _process_retweets(self):
        """Build a SocialMedia object from the tokens that start with "RT".

        :return: SocialMedia instance whose text is those tokens joined by
            single spaces (an empty string when there are none)
        """
        # Reuse the tokens already produced by the inherited tokenizer
        # instead of splitting the text a second time
        retweet_text = [token for token in self.tokens if token.startswith("RT")]
        # Return retweet_text as a SocialMedia object
        return SocialMedia(" ".join(retweet_text))

In [76]:
# Build a Tweets analyzer; the tokens starting with "RT" are collected into a
# SocialMedia object stored on the `retweets` attribute
tweeter_text_analyzer = Tweets("RT12 RT34 @anna @david")
# Show the text of that retweets object
print(tweeter_text_analyzer.retweets.text)

RT12 RT34
